diff --git a/.github/dependabot.yml b/.github/dependabot.yml index eb9008bdaa..a4a10fee69 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -23,11 +23,14 @@ updates: - go - release-chore schedule: - interval: weekly + interval: monthly day: monday time: "03:00" timezone: America/Los_Angeles target-branch: develop + ignore: + - dependency-name: "google.golang.org/api" + - package-ecosystem: pip directory: /community/front-end/ofe/ labels: @@ -45,3 +48,18 @@ updates: # Disable version updates, do security updates only # See https://docs.github.com/en/code-security/dependabot/dependabot-security-updates/configuring-dependabot-security-updates#overriding-the-default-behavior-with-a-configuration-file open-pull-requests-limit: 0 +- package-ecosystem: pip + directory: /community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/ + labels: + - dependencies + - python + - release-chore + schedule: + interval: weekly + day: monday + time: "03:00" + timezone: America/Los_Angeles + target-branch: develop + # Disable version updates, do security updates only + # See https://docs.github.com/en/code-security/dependabot/dependabot-security-updates/configuring-dependabot-security-updates#overriding-the-default-behavior-with-a-configuration-file + open-pull-requests-limit: 0 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f3c224983f..c51f91e39d 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,5 +1,7 @@ ### Submission Checklist +NOTE: Community submissions can take up to 2 weeks to be reviewed. + Please take the following actions before submitting this pull request. * Fork your PR branch from the Toolkit "develop" branch (not main) diff --git a/.github/workflows/pr-precommit.yml b/.github/workflows/pr-precommit.yml index e1f56d0007..37234d2a0e 100644 --- a/.github/workflows/pr-precommit.yml +++ b/.github/workflows/pr-precommit.yml @@ -38,10 +38,6 @@ jobs: python-version: '3.10' check-latest: true cache: 'pip' - - run: > - pip install - -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt - -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt - uses: actions/setup-go@v5 with: go-version: '1.22' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7cecfd5809..42b6a6f041 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -76,7 +76,7 @@ repos: require_serial: true - id: pytest-check name: pytest-check - entry: pytest + entry: python -m pytest language: system types: [python] pass_filenames: false diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6272489dae..03bfefa2d0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -20,7 +20,11 @@ again. All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more -information on using pull requests. +information on pull requests. + +### Standard PR Response Times + +Community submissions can take up to 2 weeks to be reviewed. 
## Community Guidelines diff --git a/Makefile b/Makefile index 1222be60d1..02bf23f7cd 100644 --- a/Makefile +++ b/Makefile @@ -69,6 +69,8 @@ install-dev-deps: warn-terraform-version warn-packer-version check-pre-commit ch go install mvdan.cc/sh/v3/cmd/shfmt@latest go install golang.org/x/tools/cmd/goimports@latest go install honnef.co/go/tools/cmd/staticcheck@latest + pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/requirements.txt + pip install -r community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt # RULES SUPPORTING THE ABOVE diff --git a/cmd/root.go b/cmd/root.go index 106be3bdf8..6ce5c00e5c 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`, logging.Fatal("cmd.Help function failed: %s", err) } }, - Version: "v1.38.0", + Version: "v1.39.0", Annotations: annotation, } ) diff --git a/community/examples/AMD/README.md b/community/examples/AMD/README.md index f0f67d367d..ffc25e2598 100644 --- a/community/examples/AMD/README.md +++ b/community/examples/AMD/README.md @@ -53,10 +53,10 @@ using the `compute` partition, you may ignore its quota requirements. ### Deploying the Blueprint -Use `ghpc` to provision the blueprint, supplying your project ID: +Use `gcluster` to provision the blueprint, supplying your project ID: ```shell -ghpc create --vars project_id=<> hpc-amd-slurm.yaml +gcluster create --vars project_id=<> hpc-amd-slurm.yaml ``` It will create a directory containing a Terraform module. Follow the printed diff --git a/community/examples/flux-framework/README.md b/community/examples/flux-framework/README.md index c1e2d8271d..aa8b580c24 100644 --- a/community/examples/flux-framework/README.md +++ b/community/examples/flux-framework/README.md @@ -26,15 +26,15 @@ Toolkit guidance to enable [APIs][apis] and establish minimum resource ### Deploy the flux-framework Cluster -Use `ghcp` to provision the blueprint +Use `gcluster` to provision the blueprint ```bash -ghpc create community/examples/flux-framework --vars project_id=<> +gcluster create community/examples/flux-framework --vars project_id=<> ``` This will create a directory containing Terraform modules. 
-Follow `ghpc` instructions to deploy the cluster +Follow `gcluster` instructions to deploy the cluster ```text terraform -chdir=flux-fw-cluster/primary init diff --git a/community/examples/hpc-slurm6-tpu-maxtext.yaml b/community/examples/hpc-slurm6-tpu-maxtext.yaml index 5e172cd5c2..b8a8121a5d 100644 --- a/community/examples/hpc-slurm6-tpu-maxtext.yaml +++ b/community/examples/hpc-slurm6-tpu-maxtext.yaml @@ -72,7 +72,7 @@ deployment_groups: python3 MaxText/train.py MaxText/configs/base.yml run_name= base_output_directory=${PWD}/output/ dataset_path= async_checkpointing=False attention= steps= - id: tpu_nodeset - source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: node_type: v4-8 @@ -88,7 +88,7 @@ deployment_groups: node_count_dynamic_max: 1 - id: tpu_partition - source: ./community/modules/compute/schedmd-slurm-gcp-v6-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [tpu_nodeset] settings: partition_name: tpu @@ -110,14 +110,14 @@ deployment_groups: is_default: true - id: slurm_login - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: [network] settings: enable_login_public_ips: true machine_type: n2-standard-16 - id: slurm_controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - tpu_partition - compute_partition diff --git a/community/examples/hpc-slurm6-tpu.yaml b/community/examples/hpc-slurm6-tpu.yaml index 0f6455884f..606dae3246 100644 --- a/community/examples/hpc-slurm6-tpu.yaml +++ b/community/examples/hpc-slurm6-tpu.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/vpc - id: tpu_nodeset - source: ./community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu use: [network] settings: node_type: v3-8 @@ -45,20 +45,20 @@ deployment_groups: node_count_dynamic_max: 1 - id: tpu_partition - source: ./community/modules/compute/schedmd-slurm-gcp-v6-partition + source: community/modules/compute/schedmd-slurm-gcp-v6-partition use: [tpu_nodeset] settings: partition_name: tpu - id: slurm_login - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: [network] settings: machine_type: n2-standard-4 enable_login_public_ips: true - id: slurm_controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - tpu_partition - slurm_login diff --git a/community/examples/intel/README.md b/community/examples/intel/README.md index 34305c9215..e83bd27391 100644 --- a/community/examples/intel/README.md +++ b/community/examples/intel/README.md @@ -63,10 +63,10 @@ The Pre-deployment Guide provides instructions for: ### Deploy the DAOS Cluster -After completing the steps in the [Pre-deployment Guide][pre-deployment_guide] use `ghpc` to provision the blueprint +After completing the steps in the [Pre-deployment Guide][pre-deployment_guide] use `gcluster` to provision the blueprint ```text -ghpc create community/examples/intel/pfs-daos.yaml \ +gcluster create community/examples/intel/pfs-daos.yaml \ --vars project_id=<> \ [--backend-config bucket=] ``` @@ -75,10 +75,10 @@ This will create the deployment directory containing Terraform 
modules and Packer templates. The `--backend-config` option is not required but recommended. It will save the terraform state in a pre-existing [Google Cloud Storage bucket][bucket]. For more information see [Setting up a remote terraform -state][backend]. Use `ghpc deploy` to provision your DAOS storage cluster: +state][backend]. Use `gcluster deploy` to provision your DAOS storage cluster: ```text -ghpc deploy pfs-daos --auto-approve +gcluster deploy pfs-daos --auto-approve ``` [backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state @@ -238,7 +238,7 @@ See the [DFuse (DAOS FUSE)](https://docs.daos.io/v2.4/user/filesystem/?h=dfuse#d Delete the remaining infrastructure ```bash -ghpc destroy pfs-daos --auto-approve +gcluster destroy pfs-daos --auto-approve ``` ## DAOS Server with Slurm cluster @@ -291,10 +291,10 @@ The following available quota is required in the region used by Slurm: ### Deploy the DAOS/Slurm Cluster -Use `ghpc` to provision the blueprint, supplying your project ID +Use `gcluster` to provision the blueprint, supplying your project ID ```text -ghpc create community/examples/intel/hpc-slurm-daos.yaml \ +gcluster create community/examples/intel/hpc-slurm-daos.yaml \ --vars project_id=<> \ [--backend-config bucket=] ``` @@ -304,10 +304,10 @@ templates. The `--backend-config` option is not required but recommended. It will save the terraform state in a pre-existing [Google Cloud Storage bucket][bucket]. For more information see [Setting up a remote terraform state][backend]. -Follow `ghpc` instructions to deploy the environment +Follow `gcluster` instructions to deploy the environment ```text -ghpc deploy hpc-slurm-daos --auto-approve +gcluster deploy hpc-slurm-daos --auto-approve ``` [backend]: ../../../examples/README.md#optional-setting-up-a-remote-terraform-state @@ -450,5 +450,5 @@ have been shutdown and deleted by the Slurm autoscaler. Delete the remaining infrastructure: ```bash -ghpc destroy hpc-slurm-daos --auto-approve +gcluster destroy hpc-slurm-daos --auto-approve ``` diff --git a/community/front-end/ofe/README.md b/community/front-end/ofe/README.md index 6d9a028122..8bcfb54a7d 100644 --- a/community/front-end/ofe/README.md +++ b/community/front-end/ofe/README.md @@ -15,7 +15,7 @@ steps: * Prepare the client side environment and secure sufficient IAM permissions for the system deployment. * When ready, clone this repository and run the deployment script at - `hpc-toolkit/community/front-end/ofe/deploy.sh` from a client machine or a Cloud + `cluster-toolkit/community/front-end/ofe/deploy.sh` from a client machine or a Cloud Shell. Follow instructions to complete the deployment. The whole process is automated via Terraform and should complete within 15 minutes. * Perform post-deployment configurations. diff --git a/community/front-end/ofe/deploy.sh b/community/front-end/ofe/deploy.sh index 7b9fd09c82..a546d9686d 100755 --- a/community/front-end/ofe/deploy.sh +++ b/community/front-end/ofe/deploy.sh @@ -504,7 +504,7 @@ deploy() { # -- Collect deployment files # # For a tarball deployment, it is important that the 'root' directory is - # named 'hpc-toolkit' as most of the install depends on it. + # named 'cluster-toolkit' as most of the install depends on it. # # Simplest way to ensure this is to build from a temporary copy that # definitely is named correctly. 
@@ -512,7 +512,7 @@ deploy() { if [ "${deployment_mode}" == "tarball" ]; then basedir=$(git rev-parse --show-toplevel) - tdir=/tmp/hpc-toolkit + tdir=/tmp/cluster-toolkit cp -R "${basedir}" ${tdir}/ ( @@ -523,7 +523,7 @@ deploy() { --exclude=.terraform.lock.hcl \ --exclude=tf \ --directory=/tmp \ - ./hpc-toolkit 2>/dev/null + ./cluster-toolkit 2>/dev/null ) rm -rf ${tdir} @@ -562,7 +562,7 @@ TFVARS fi if [ "${deployment_mode}" == "git" ]; then - echo "Will clone hpc-toolkit from github.com/${repo_fork}/hpc-toolkit.git ${repo_branch} branch." + echo "Will clone cluster-toolkit from github.com/${repo_fork}/cluster-toolkit.git ${repo_branch} branch." cat <<-END >>terraform.tfvars repo_fork = "${repo_fork}" diff --git a/community/front-end/ofe/docs/developer_guide.md b/community/front-end/ofe/docs/developer_guide.md index 63abc18a9a..1b6a4202a4 100644 --- a/community/front-end/ofe/docs/developer_guide.md +++ b/community/front-end/ofe/docs/developer_guide.md @@ -148,7 +148,7 @@ The home directory of the *gcluster* account is at `/opt/gcluster`. For a new de #### For cloud resources Run-time data to support creating and managing cloud resources are generated -and stored in the following sub-directories within `hpc-toolkit/frontend` on +and stored in the following sub-directories within `cluster-toolkit/frontend` on the service machine: - `clusters/cluster_\` - holding run-time data for a cluster. `\` here @@ -246,7 +246,7 @@ define the major components: | dir | description | |-----------------------------|-------------| -| `hpc-toolkit/frontend/` | Top level | +| `cluster-toolkit/frontend/` | Top level | | `.../cli/` | client commandline interface | | `.../docs/` | documentation | | `.../infrastructure_files/` | Support files for deploying cloud infrastructure | @@ -344,7 +344,7 @@ not currently support Vertex AI Workbenches. ### Infrastructure files Workbenches are created using a template configuration in -`hpc-toolkit/frontend/infrastructure_files/workbench_tf`. The Terraform +`cluster-toolkit/frontend/infrastructure_files/workbench_tf`. The Terraform template was originally based on the Terraform template provided by the [Google Cloud Platform Rad-Lab git repo](https://github.com/GoogleCloudPlatform/rad-lab) however the configuration diverged during early development. The main reason @@ -353,11 +353,11 @@ specific OSLogin user rather than the generic Jupyter user which would make it impossible to interact properly with any mounted shared storage. The process of creating the workbench files is mostly contained within the file -`hpc-toolkit/frontend/website/ghpcfe/cluster_manager/workbenchinfo.py`. The +`cluster-toolkit/frontend/website/ghpcfe/cluster_manager/workbenchinfo.py`. 
The `copy_terraform()` routine copies files from the `infrastructure_files` directory while the `prepare_terraform_vars()` routine creates a `terraform.tfvars` file within the -`hpc-toolkit/frontend/workbenches/workbench_##` directory to provide the +`cluster-toolkit/frontend/workbenches/workbench_##` directory to provide the following info gathered by the FrontEnd during the workbench creation process: - region diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py index 2a4a144e28..53140cc50d 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/files/ghpcfe_c2daemon.py @@ -243,12 +243,17 @@ def _slurm_get_job_info(jobid): def _slurm_get_job_state(jobid): - """Returns the job state, or None if job isn't in the queue""" - # N.B - eventually, pyslurm might work with our version of Slurm, - # and this can be changed to something more sane. For now, call squeue - state = _slurm_get_job_info(jobid) - return state.get("job_state", None) if state else None + """Returns the job state, or None if the job isn't in the queue""" + state = _slurm_get_job_info(jobid) # Fetch job info using an external function + job_state = state.get("job_state", None) if state else None # Get the 'job_state' if available + + if job_state and isinstance(job_state, list) and job_state: + logger.info("Slurm returned job %s with state %s", jobid, job_state[0]) # Log the first state if available + return job_state[0] # Return the first element of the state list + else: + logger.info("No valid job state available for job %s", jobid) # Log when no valid state is found + return None # Return None if there is no job state or it's not a list def _spack_submit_build(app_id, partition, app_name, spec, extra_sbatch=None): build_dir = Path("/opt/cluster/installs") / str(app_id) @@ -925,12 +930,14 @@ def cb_run_job(message, **kwargs): try: slurm_job_info = _slurm_get_job_info(slurm_jobid) response["job_runtime"] = ( - slurm_job_info["end_time"] - slurm_job_info["start_time"] + slurm_job_info["end_time"]["number"] - slurm_job_info["start_time"]["number"] ) except KeyError: logger.warning( "Job data from SLURM did not include start time and end time" ) + except Exception as E: + logger.error("Unexpected error: %s", E) kpi = job_dir / "kpi.json" if kpi.is_file(): diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml index 6ffe2cf2ae..8a5d8d4724 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml @@ -13,37 +13,30 @@ # limitations under the License. 
--- -- name: Set most recent Python version as default - ansible.builtin.shell: - cmd: | - latest_version=$(ls -1 /usr/bin/python3* | awk -F/ '{print $NF}' | grep -E 'python[0-9]+\.[0-9]+$' | sort -V | tail -1) - alternatives --set python3 /usr/bin/$latest_version - when: ansible_distribution == 'Rocky' +- name: Get default Python interpreter from update-alternatives + ansible.builtin.shell: > + update-alternatives --display python3 | + grep 'link currently points to' | + awk '{print $NF}' + register: default_python + changed_when: false -- name: Install pip3 - ansible.builtin.package: - name: python3-pip - state: present - become: true - when: ansible_distribution == 'Rocky' +- name: Set default Python interpreter for Ansible + ansible.builtin.set_fact: + ansible_python_interpreter: "{{ default_python.stdout }}" -- name: Install setuptools for Python 3.11 - ansible.builtin.command: - cmd: /usr/bin/python3.11 -m ensurepip --upgrade - become: true - when: ansible_distribution == 'Rocky' +- name: Verify Python interpreter + ansible.builtin.command: "{{ ansible_python_interpreter }} --version" + register: python_version -- name: Upgrade PIP3 - ansible.builtin.pip: - executable: pip3 - name: pip - state: forcereinstall +- name: Display Python version + ansible.builtin.debug: + msg: "The Python interpreter version is: {{ python_version.stdout }}" # Can't use the pip action here because we need to explicitly enable # a modern gcc from the dev_env role - name: Install FE C&C Dependencies ansible.builtin.pip: - executable: pip3 name: - requests - pexpect diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index ecce1e3a32..dd92f7641f 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -35,7 +35,7 @@ deploy_mode=$(curl --silent --show-error http://metadata/computeMetadata/v1/inst # Exit if deployment already exists to stop startup script running on reboots # -if [[ -d /opt/gcluster/hpc-toolkit ]]; then +if [[ -d /opt/gcluster/cluster-toolkit ]]; then printf "It appears gcluster has already been deployed. 
Exiting...\n" exit 0 fi @@ -48,9 +48,10 @@ printf "####################\n#### Installing required packages\n############### dnf install -y epel-release dnf update -y --security dnf config-manager --add-repo https://rpm.releases.hashicorp.com/RHEL/hashicorp.repo +dnf install -y terraform-1.4.6 dnf install --best -y google-cloud-sdk nano make gcc python38-devel unzip git \ rsync wget nginx bind-utils policycoreutils-python-utils \ - terraform packer supervisor python3-certbot-nginx jq + packer supervisor python3-certbot-nginx jq curl --silent --show-error --location https://github.com/mikefarah/yq/releases/download/v4.13.4/yq_linux_amd64 --output /usr/local/bin/yq chmod +x /usr/local/bin/yq curl --silent --show-error --location https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz --output /tmp/shellcheck.tar.xz @@ -75,7 +76,7 @@ EOL dnf install -y grafana -# Packages for https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_enable_cleanup_compute +# Packages for https://github.com/GoogleCloudPlatform/cluster-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_enable_cleanup_compute pip3.8 install google-api-python-client \ google-cloud-secret-manager \ google.cloud.pubsub \ @@ -136,7 +137,7 @@ fi useradd -r -m -d /opt/gcluster gcluster if [ "${deploy_mode}" == "git" ]; then - fetch_hpc_toolkit="git clone -b \"${repo_branch}\" https://github.com/${repo_fork}/hpc-toolkit.git" + fetch_hpc_toolkit="git clone -b \"${repo_branch}\" https://github.com/${repo_fork}/cluster-toolkit.git" elif [ "${deploy_mode}" == "tarball" ]; then printf "\n####################\n#### Download web application files\n####################\n" @@ -159,8 +160,8 @@ EOF # Install go version specified in go.mod file # # Note: go.mod doesn't reference minor version so we need to capture the latest -GO_MAJOR_VERSION=$(awk '/^go/ {print $2}' "/opt/gcluster/hpc-toolkit/go.mod") -GO_API_RESPONSE=$(curl --silent "https://go.dev/dl/?mode=json") +GO_MAJOR_VERSION=$(awk '/^go/ {print $2}' "/opt/gcluster/cluster-toolkit/go.mod") +GO_API_RESPONSE=$(curl --silent "https://go.dev/dl/?mode=json&include=all") GO_VERSION=$(echo "$GO_API_RESPONSE" | jq -r --arg major "go$GO_MAJOR_VERSION" '.[] | select(.version | startswith($major)).version' | sort -V | tail -n 1) GO_DOWNLOAD_URL="https://golang.org/dl/${GO_VERSION}.linux-amd64.tar.gz" curl --silent --show-error --location "${GO_DOWNLOAD_URL}" --output "/tmp/${GO_VERSION}.linux-amd64.tar.gz" @@ -171,7 +172,7 @@ rm -rf /usr/local/go && tar -C /usr/local -xzf "/tmp/${GO_VERSION}.linux-amd64.t echo 'export PATH=$PATH:/usr/local/go/bin:~/go/bin' >>/etc/bashrc sudo su - gcluster -c /bin/bash < configuration.yaml @@ -243,7 +244,7 @@ EOL printf "Creating supervisord service..." 
echo "[program:gcluster-uvicorn-background] process_name=%(program_name)s_%(process_num)02d -directory=/opt/gcluster/hpc-toolkit/community/front-end/ofe/website +directory=/opt/gcluster/cluster-toolkit/community/front-end/ofe/website command=/opt/gcluster/django-env/bin/uvicorn website.asgi:application --reload --host 127.0.0.1 --port 8001 autostart=true autorestart=true @@ -261,8 +262,8 @@ After=supervisord.service grafana-server.service [Service] Type=forking -ExecStart=/usr/sbin/nginx -p /opt/gcluster/run/ -c /opt/gcluster/hpc-toolkit/community/front-end/ofe/website/nginx.conf -ExecStop=/usr/sbin/nginx -p /opt/gcluster/run/ -c /opt/gcluster/hpc-toolkit/community/front-end/ofe/website/nginx.conf -s stop +ExecStart=/usr/sbin/nginx -p /opt/gcluster/run/ -c /opt/gcluster/cluster-toolkit/community/front-end/ofe/website/nginx.conf +ExecStop=/usr/sbin/nginx -p /opt/gcluster/run/ -c /opt/gcluster/cluster-toolkit/community/front-end/ofe/website/nginx.conf -s stop PIDFile=/opt/gcluster/run/nginx.pid Restart=no @@ -280,7 +281,7 @@ systemctl status gcluster.service # sudo su - gcluster -c /bin/bash <>"${tmpcron}" # .. if something more forceful/complete is needed: - # echo "0 12 * * * /usr/bin/certbot certonly --force-renew --quiet" --nginx --nginx-server-root=/opt/gcluster/hpc-toolkit/community/front-end/ofe/website --cert-name "${SERVER_HOSTNAME}" -m "${DJANGO_EMAIL}" >>"${tmpcron}" + # echo "0 12 * * * /usr/bin/certbot certonly --force-renew --quiet" --nginx --nginx-server-root=/opt/gcluster/cluster-toolkit/community/front-end/ofe/website --cert-name "${SERVER_HOSTNAME}" -m "${DJANGO_EMAIL}" >>"${tmpcron}" crontab -u root "${tmpcron}" rm "${tmpcron}" diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index f2764643c5..26756d670c 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -12,7 +12,7 @@ cffi==1.15.1 cfgv==3.3.1 charset-normalizer==3.1.0 click==8.1.3 -cryptography==42.0.4 +cryptography==43.0.1 decorator==5.1.1 defusedxml==0.7.1 dill==0.3.6 diff --git a/community/front-end/ofe/tf/README.md b/community/front-end/ofe/tf/README.md index faa58140ae..212979bfc1 100644 --- a/community/front-end/ofe/tf/README.md +++ b/community/front-end/ofe/tf/README.md @@ -61,7 +61,7 @@ limitations under the License. | [project\_id](#input\_project\_id) | GCP Project in which to deploy the HPC Frontend. | `string` | n/a | yes | | [region](#input\_region) | GCP Region for HPC Frontend deployment. | `string` | n/a | yes | | [repo\_branch](#input\_repo\_branch) | git branch to checkout when deploying the HPC Frontend | `string` | `"main"` | no | -| [repo\_fork](#input\_repo\_fork) | GitHub repository name in which to find the hpc-toolkit repo | `string` | `"GoogleCloudPlatform"` | no | +| [repo\_fork](#input\_repo\_fork) | GitHub repository name in which to find the cluster-toolkit repo | `string` | `"GoogleCloudPlatform"` | no | | [server\_instance\_type](#input\_server\_instance\_type) | Instance size to use from HPC Frontend webserver | `string` | `"e2-standard-2"` | no | | [static\_ip](#input\_static\_ip) | Optional pre-configured static IP for HPC Frontend. | `string` | `""` | no | | [subnet](#input\_subnet) | Subnet in which to deploy HPC Frontend. 
| `string` | `""` | no | diff --git a/community/front-end/ofe/tf/variables.tf b/community/front-end/ofe/tf/variables.tf index fec65fd059..be06e2578a 100644 --- a/community/front-end/ofe/tf/variables.tf +++ b/community/front-end/ofe/tf/variables.tf @@ -94,7 +94,7 @@ variable "repo_branch" { variable "repo_fork" { default = "GoogleCloudPlatform" type = string - description = "GitHub repository name in which to find the hpc-toolkit repo" + description = "GitHub repository name in which to find the cluster-toolkit repo" } variable "deployment_key" { diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py index b5708ef1f4..c6b8427394 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/cloud_info.py @@ -83,6 +83,56 @@ def _get_gcp_client(credentials, service="compute", api_version="v1"): ) +def _get_vm_reservations(credentials, zone, ttl_hash=None): + try: + # logger.info(f"Fetching VM reservations for credentials: {credentials}, zone: {zone}") + project, client = _get_gcp_client(credentials) + + req = client.reservations().list(project=project, zone=zone) + resp = req.execute() + + if "items" not in resp: + # logger.info("No reservations found") + return {} + + data = { + reservation["name"]: { + "name": reservation["name"], + "specificReservationRequired": reservation.get("specificReservationRequired", False), + "status": reservation["status"], + "instanceProperties": { + "machineType": reservation + .get("specificReservation", {}) + .get("instanceProperties", {}) + .get("machineType", ""), + "minCpuPlatform": reservation + .get("specificReservation", {}) + .get("instanceProperties", {}) + .get("minCpuPlatform", ""), + "availableCount": int( + reservation + .get("specificReservation", {}) + .get("count", 0) + ) + }, + "shareSettings": reservation.get("shareSettings", {}), + } + for reservation in resp["items"] + } + + # logger.info(f"Reservations data: {data}") + return data + except Exception as e: + logger.error(f"Error fetching VM reservations: {e}") + return {} + +def get_vm_reservations(cloud_provider, credentials, unused_region, zone): + if cloud_provider == "GCP": + return _get_vm_reservations(credentials, zone, ttl_hash=_get_ttl_hash()) + else: + raise Exception(f'Unsupported Cloud Provider "{cloud_provider}"') + + @lru_cache def _get_gcp_disk_types( credentials, zone, ttl_hash=None @@ -116,6 +166,10 @@ def _get_gcp_machine_types( ): # pylint: disable=unused-argument (project, client) = _get_gcp_client(credentials) + # Fetch disk types dynamically + disk_types = _get_gcp_disk_types(credentials, zone, ttl_hash=ttl_hash) + disk_type_names = [disk_type["name"] for disk_type in disk_types] + req = client.machineTypes().list( project=project, zone=zone, filter="isSharedCpu=False" ) @@ -124,6 +178,98 @@ def _get_gcp_machine_types( if "items" not in resp: return [] + invalid_disk_types = { + "c4-": [ + "local-ssd", "pd-standard", "pd-balanced", "pd-ssd", + "pd-extreme", "hyperdisk-ml", "hyperdisk-throughput" + ], + "c3-": [ + "pd-extreme", "pd-standard" + ], + "c3d-": [ + "pd-standard", "pd-extreme", "hyperdisk-extreme" + ], + "n4-": [ + "local-ssd", "pd-standard", "pd-balanced", "pd-ssd", + "pd-extreme", "hyperdisk-extreme", "hyperdisk-ml", + "hyperdisk-throughput" + ], + "n2-": [ + "hyperdisk-balanced", "hyperdisk-ml" + ], + "n2d-": [ + "pd-extreme", "hyperdisk-ml", "hyperdisk-balanced", + 
"hyperdisk-extreme" + ], + "n1-": [ + "pd-extreme", "hyperdisk-extreme", "hyperdisk-ml", + "hyperdisk-throughput", "hyperdisk-balanced" + ], + "t2d-": [ + "pd-extreme", "local-ssd", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme" + ], + "t2a-": [ + "local-ssd", "pd-extreme", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme", + "hyperdisk-throughput" + ], + "e2-": [ + "local-ssd", "pd-extreme", "hyperdisk-balanced", + "hyperdisk-ml", "hyperdisk-extreme", + "hyperdisk-throughput" + ], + "z3-": [ + "pd-extreme", "pd-standard", "hyperdisk-balanced", + "hyperdisk-ml" + ], + "h3-": [ + "local-ssd", "pd-standard", "pd-ssd", "pd-extreme", + "hyperdisk-ml", "hyperdisk-extreme" + ], + "c2-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-extreme", + "hyperdisk-ml", "hyperdisk-throughput" + ], + "c2d-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-extreme", + "hyperdisk-ml", "hyperdisk-throughput" + ], + "x4-": [ + "local-ssd", "pd-ssd", "pd-standard", "pd-balanced", + "pd-extreme", "hyperdisk-ml", "hyperdisk-throughput" + ], + "m3-": [ + "hyperdisk-throughput", "hyperdisk-ml", "pd-standard" + ], + "m2-": [ + "local-ssd", "hyperdisk-ml", "hyperdisk-throughput" + ], + "m1-": [ + "hyperdisk-ml", "hyperdisk-throughput" + ], + "n1-": [ + "pd-extreme", "hyperdisk-balanced", "hyperdisk-ml", + "hyperdisk-extreme", "hyperdisk-throughput" + ], + "a3-": [ + "pd-extreme", "pd-standard", + "hyperdisk-balanced" + ], + "a2-": [ + "pd-extreme", "hyperdisk-throughput", + "hyperdisk-balanced", "hyperdisk-extreme" + ], + "g2-": [ + "pd-extreme", "pd-standard", "hyperdisk-balanced", + "hyperdisk-extreme" + ] + } + + def get_invalid_disk_types(machine_type_name): + family = machine_type_name.split("-")[0] + "-" + return invalid_disk_types.get(family, []) + data = { mt["name"]: { "name": mt["name"], @@ -138,6 +284,7 @@ def _get_gcp_machine_types( } for acc in mt.get("accelerators", []) }, + "invalid_disk_types": get_invalid_disk_types(mt["name"]) } for mt in resp["items"] } @@ -174,6 +321,8 @@ def _get_gcp_machine_types( items[0]["description"] ) + # logger.info(data) + return data diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index 516d791701..e01acf1602 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -55,7 +55,7 @@ class ClusterInfo: def __init__(self, cluster): self.config = utils.load_config() - self.ghpc_path = "/opt/gcluster/hpc-toolkit/ghpc" + self.ghpc_path = "/opt/gcluster/cluster-toolkit/ghpc" self.cluster = cluster self.cluster_dir = ( @@ -164,7 +164,7 @@ def _set_credentials(self, creds=None): def _create_ssh_key(self, target_dir): # ssh-keygen -t rsa -f /.ssh/id_rsa -N "" sshdir = target_dir / ".ssh" - + if not sshdir.exists(): sshdir.mkdir(mode=0o711) @@ -196,7 +196,7 @@ def _prepare_ghpc_filesystems(self): filesystems_yaml = [] refs = [] template = self.env.get_template('blueprint/filesystem_config.yaml.j2') - + for (count, mp) in enumerate(self.cluster.mount_points.order_by("mount_order")): storage_id = f"mount_num_{mp.id}" server_ip = "'$controller'" if mp.export in self.cluster.shared_fs.exports.all() else mp.export.server_name @@ -212,7 +212,7 @@ def _prepare_ghpc_filesystems(self): indented_yaml = self.indent_text(rendered_yaml, 1) # Indent as necessary... 
filesystems_yaml.append(indented_yaml) refs.append(context['storage_id']) - + return ("\n\n".join(filesystems_yaml), refs) def _prepare_ghpc_partitions(self, part_uses): @@ -254,7 +254,7 @@ def _prepare_cloudsql_yaml(self): def _yaml_refs_to_uses(self, use_list, indent_level=0): indent = ' ' * indent_level use_lines = [f"{indent}- {item}" for item in use_list] - return "\n".join(use_lines) + return "\n".join(use_lines) def _prepare_ghpc_yaml(self): try: @@ -281,6 +281,18 @@ def _prepare_ghpc_yaml(self): } rendered_yaml = template.render(context) + if self.cluster.controller_node_image is not None: + context["controller_image_yaml"] = f"""instance_image: + family: image-{self.cluster.controller_node_image.family} + project: {self.cluster.project_id} + """ + + if self.cluster.login_node_image is not None: + context["login_image_yaml"] = f"""instance_image: + family: image-{self.cluster.login_node_image.family} + project: {self.cluster.project_id} + """ + with yaml_file.open("w") as f: f.write(rendered_yaml) @@ -368,6 +380,8 @@ def _get_tf_state_resource(self, state, filters): Returns each match """ + print(state["resources"]) + print(filters) def matches(x): try: @@ -381,6 +395,7 @@ def matches(x): return list(filter(matches, state["resources"])) def _create_model_instances_from_tf_state(self, state, filters): + print(self._get_tf_state_resource(state, filters)) tf_nodes = self._get_tf_state_resource(state, filters)[0]["instances"] def model_from_tf(tf): @@ -423,7 +438,7 @@ def model_from_tf(tf): return existing_instance # Return the existing instance except ComputeInstance.DoesNotExist: # If the instance doesn't exist, create a new one - return ComputeInstance(**ci_kwargs) + return ComputeInstance(**ci_kwargs) return [model_from_tf(instance) for instance in tf_nodes] @@ -434,14 +449,14 @@ def _get_service_accounts(self, tf_state): # controller & login until we start setting them. 
filters = { - "module": "module.slurm_controller.module.slurm_controller_instance.module.slurm_controller_instance", #pylint:disable=line-too-long + "module": "module.slurm_controller.module.slurm_controller_instance", #pylint:disable=line-too-long "name": "slurm_instance", } tf_node = self._get_tf_state_resource(tf_state, filters)[0]["instances"][0] #pylint:disable=line-too-long ctrl_sa = tf_node["attributes"]["service_account"][0]["email"] filters = { - "module": "module.slurm_login.module.slurm_login_instance.module.slurm_login_instance", #pylint:disable=line-too-long + "module": 'module.slurm_controller.module.slurm_login_instance["slurm-login"]', #pylint:disable=line-too-long "name": "slurm_instance", } tf_node = self._get_tf_state_resource(tf_state, filters)[0]["instances"][0] #pylint:disable=line-too-long @@ -518,7 +533,7 @@ def _apply_terraform(self): mgmt_nodes = self._create_model_instances_from_tf_state( state, { - "module": "module.slurm_controller.module.slurm_controller_instance.module.slurm_controller_instance", # pylint: disable=line-too-long + "module": "module.slurm_controller.module.slurm_controller_instance", # pylint: disable=line-too-long "name": "slurm_instance", }, ) @@ -539,7 +554,7 @@ def _apply_terraform(self): login_nodes = self._create_model_instances_from_tf_state( state, { - "module": "module.slurm_login.module.slurm_login_instance.module.slurm_login_instance", # pylint: disable=line-too-long + "module": 'module.slurm_controller.module.slurm_login_instance["slurm-login"]', # pylint: disable=line-too-long "name": "slurm_instance", }, ) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py index f735107123..aaf460c4c4 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/filesystem.py @@ -90,7 +90,7 @@ def create_filesystem(fs: Filesystem) -> None: def _run_ghpc(target_dir: Path, cred_env: dict) -> None: - ghpc_path = "/opt/gcluster/hpc-toolkit/ghpc" + ghpc_path = "/opt/gcluster/cluster-toolkit/ghpc" try: logger.info("Invoking ghpc create") diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py index 363029db9b..22b2f3f030 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py @@ -33,7 +33,7 @@ class ImageBackend: def __init__(self, image): self.config = utils.load_config() - self.ghpc_path = "/opt/gcluster/hpc-toolkit/ghpc" + self.ghpc_path = "/opt/gcluster/cluster-toolkit/ghpc" self.image = image self.image_dir = ( @@ -51,7 +51,7 @@ def prepare(self): 1. Create the necessary directory structure for the image. 2. Generate a Cluster Toolkit blueprint to build the image. - 3. Run the Cluster Toolkit (`ghpc`) to create the image based on the blueprint. + 3. Run the Cluster Toolkit (`gcluster`) to create the image based on the blueprint. 4. Set up the builder environment on Google Cloud Platform (GCP) using Terraform. 5. Create the image on GCP using Packer. 6. Destroy the builder environment after the image creation is complete. @@ -71,7 +71,7 @@ def prepare(self): OSError: If there is an error while creating the image directory or writing to the credentials file. IOError: If there is an error while writing to the credentials file. 
- subprocess.CalledProcessError: If any of the subprocess calls (ghpc, Terraform, or Packer) + subprocess.CalledProcessError: If any of the subprocess calls (gcluster, Terraform, or Packer) encounter an error during execution. """ self._create_image_dir() @@ -172,7 +172,7 @@ def _create_blueprint(self): def _run_ghpc(self): target_dir = self.image_dir try: - logger.info(f"Invoking ghpc create for the image {self.image.id}") + logger.info(f"Invoking gcluster create for the image {self.image.id}") log_out_fn = target_dir / "ghpc_create_log.stdout" log_err_fn = target_dir / "ghpc_create_log.stderr" @@ -191,7 +191,7 @@ def _run_ghpc(self): ) except subprocess.CalledProcessError as cpe: self.update_image_status("e") - logger.error(f"ghpc exec failed for image {self.image.id}", exc_info=cpe) + logger.error(f"gcluster exec failed for image {self.image.id}", exc_info=cpe) # No logs from stdout/err - get dumped to files raise diff --git a/community/front-end/ofe/website/ghpcfe/forms.py b/community/front-end/ofe/website/ghpcfe/forms.py index d6db9c4618..dc4d7046d0 100644 --- a/community/front-end/ofe/website/ghpcfe/forms.py +++ b/community/front-end/ofe/website/ghpcfe/forms.py @@ -248,6 +248,7 @@ class Meta: "dynamic_node_count", "static_node_count", "reservation_name", + "exclusive", "enable_placement", "enable_hyperthreads", "enable_tier1_networking", @@ -316,6 +317,94 @@ def clean(self): raise ValidationError( "SlurmGCP does not support Placement Groups for selected instance type" # pylint: disable=line-too-long ) + + # schedmd-slurm-gcp-v6-partition/outputs.tf + if cleaned_data["dynamic_node_count"] > 0 and not cleaned_data[ + "exclusive" + ]: + raise ValidationError( + "If any non-static nodesets have enable placement set to true, exclusive must be true." + ) + + if cleaned_data["static_node_count"] > 0 and cleaned_data[ + "exclusive" + ]: + raise ValidationError( + "Can't use static nodes within partition with exclusive set to true." + ) + + # schedmd-slurm-gcp-v6-nodeset/outputs.tf + if cleaned_data["reservation_name"] and cleaned_data[ + "enable_placement" + ]: + raise ValidationError("If a reservation is specified, placement must be false.") + + if cleaned_data["enable_placement"] and cleaned_data[ + "static_node_count" + ] > 0 and cleaned_data[ + "dynamic_node_count" + ] > 0: + raise ValidationError( + "Cannot use placement with static and auto-scaling nodes in the same node set." + ) + + # Reservation validation logic + reservation_name = cleaned_data.get("reservation_name") + if reservation_name: + try: + cluster = cleaned_data.get('cluster') + cloud_credential = cluster.cloud_credential.detail + cloud_zone = cluster.cloud_zone + + # logger.info(f"Cluster: {cluster}") + # logger.info(f"Cloud Credential: {cloud_credential}") + # logger.info(f"Cloud Zone: {cloud_zone}") + + reservations = cloud_info.get_vm_reservations("GCP", cloud_credential, None, cloud_zone) + + if not reservations: + raise ValidationError("No reservations found for the specified zone.") + + matching_reservation = reservations.get(reservation_name) + + if not matching_reservation: + raise ValidationError( + f"Reservation {reservation_name} does not exist in the specified zone." + ) + + if matching_reservation[ + "instanceProperties" + ][ + "machineType" + ] != cleaned_data["machine_type"]: + raise ValidationError( + f"Reservation {reservation_name} does not support the specified machine type. " + f"Machine type: {cleaned_data['machine_type']}." 
+ ) + + total_requested_nodes = cleaned_data["dynamic_node_count"] + cleaned_data["static_node_count"] + available_nodes = matching_reservation.get("instanceProperties", {}).get("availableCount", 0) + + if total_requested_nodes > available_nodes: + raise ValidationError( + f"Reservation {reservation_name} does not have enough available nodes." + f"Requested: {total_requested_nodes}, Available: {available_nodes}" + ) + + specific_reservation = matching_reservation.get("specificReservationRequired") + if specific_reservation == False: + raise ValidationError( + f"You must use a 'specific' reservation type." + f"Please read the following URL for more information about setting up reservations:" + f"https://cloud.google.com/compute/docs/instances/reservations-overview#how-reservations-work" + ) + + except Exception as e: + logger.error(f"Error validating reservation: {reservation_name}. Exception: {e}") + raise ValidationError( + f"Error validating reservation: {reservation_name}. Exception: {str(e)}" + ) + return cleaned_data diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py index 12ea18ff62..075f4587f5 100644 --- a/community/front-end/ofe/website/ghpcfe/models.py +++ b/community/front-end/ofe/website/ghpcfe/models.py @@ -604,7 +604,7 @@ class Image(CloudResource): max_length=60, help_text="Enter a source image family", blank=False, - default="schedmd-v5-slurm-22-05-8-rocky-linux-8", + default="slurm-gcp-6-5-hpc-rocky-linux-8", ) startup_script = models.ManyToManyField( @@ -919,6 +919,9 @@ class ClusterPartition(models.Model): enable_hyperthreads = models.BooleanField( default=False, help_text="Enable Hyperthreads (SMT)" ) + exclusive = models.BooleanField( + default=True, help_text="Exclusive job access to nodes." 
+ ) enable_tier1_networking = models.BooleanField( default=False, help_text=( diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 index a22569d024..ac33085f4e 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/cluster_config.yaml.j2 @@ -5,9 +5,7 @@ vars: deployment_name: {{ cluster.cloud_id }} region: {{ cluster.cloud_region }} zone: {{ cluster.cloud_zone }} - enable_reconfigure: True - enable_cleanup_compute: False - enable_cleanup_subscriptions: True + enable_cleanup_compute: True enable_bigquery_load: {{ cluster.use_bigquery }} instance_image_custom: True labels: @@ -47,7 +45,7 @@ deployment_groups: {{ cloudsql_yaml | safe }} - - source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + - source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller kind: terraform id: slurm_controller settings: @@ -61,9 +59,8 @@ deployment_groups: disk_type: {{ cluster.controller_disk_type }} disk_size_gb: {{ cluster.controller_disk_size }} {{ controller_image_yaml | safe }} - service_account: - email: $(hpc_service_account.service_account_email) - scopes: + service_account_email: $(hpc_service_account.service_account_email) + service_account_scopes: - https://www.googleapis.com/auth/cloud-platform - https://www.googleapis.com/auth/monitoring.write - https://www.googleapis.com/auth/logging.write @@ -74,31 +71,30 @@ deployment_groups: echo "******************************************** CALLING CONTROLLER STARTUP" gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_controller.sh - | bash compute_startup_script: | + echo "******************************************** CALLING COMPUTE STARTUP" #!/bin/bash gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_compute.sh - | bash + login_startup_script: | + #!/bin/bash + echo "******************************************** CALLING LOGIN STARTUP" + gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_login.sh - | bash use: + - slurm_login {{ controller_uses | safe }} - - source: community/modules/scheduler/schedmd-slurm-gcp-v5-login + - source: community/modules/scheduler/schedmd-slurm-gcp-v6-login kind: terraform id: slurm_login settings: num_instances: {{ cluster.num_login_nodes }} - subnetwork_self_link: {{ cluster.subnet.cloud_id }} + subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.cloud_region }}/subnetworks/{{ cluster.subnet.cloud_id }}" machine_type: {{ cluster.login_node_instance_type }} disk_type: {{ cluster.login_node_disk_type }} disk_size_gb: {{ cluster.login_node_disk_size }} {{ login_image_yaml | safe }} - service_account: - email: $(hpc_service_account.service_account_email) - scopes: + service_account_email: $(hpc_service_account.service_account_email) + service_account_scopes: - https://www.googleapis.com/auth/cloud-platform - https://www.googleapis.com/auth/monitoring.write - https://www.googleapis.com/auth/logging.write - https://www.googleapis.com/auth/devstorage.read_write - startup_script: | - #!/bin/bash - echo "******************************************** CALLING LOGIN STARTUP" - gsutil cp gs://{{ startup_bucket }}/clusters/{{ cluster.id }}/bootstrap_login.sh - | bash - use: - - slurm_controller diff --git a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 
b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 index 86ade8151c..9951079cf2 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 +++ b/community/front-end/ofe/website/ghpcfe/templates/blueprint/partition_config.yaml.j2 @@ -1,24 +1,25 @@ -- source: community/modules/compute/schedmd-slurm-gcp-v5-partition +- source: community/modules/compute/schedmd-slurm-gcp-v6-partition kind: terraform id: {{ part_id }} use: - - {{ part_id }}-group -{{ uses_str }} + - {{ part_id }}-nodeset settings: partition_name: {{ part.name }} - subnetwork_self_link: {{ cluster.subnet.cloud_id }} - enable_placement: {{ part.enable_placement }} - exclusive: {{ exclusive }} + exclusive: {{ part.exclusive }} + resume_timeout: 500 -- source: community/modules/compute/schedmd-slurm-gcp-v5-node-group - id: {{ part_id }}-group +- source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + id: {{ part_id }}-nodeset use: +{{ uses_str }} settings: bandwidth_tier: {% if part.enable_tier1_networking %}tier_1_enabled{% else %}platform_default{% endif %} + subnetwork_self_link: "projects/{{ cluster.project_id }}/regions/{{ cluster.cloud_region }}/subnetworks/{{ cluster.subnet.cloud_id }}" enable_smt: {{ part.enable_hyperthreads }} + enable_placement: {{ part.enable_placement }} machine_type: {{ part.machine_type }} {% if part.reservation_name %} - reservation_name: {{ part.reservation_name }} + reservation_name: "projects/{{ cluster.project_id }}/reservations/{{ part.reservation_name }}" {% endif %} node_count_dynamic_max: {{ part.dynamic_node_count }} node_count_static: {{ part.static_node_count }} diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html index 8423fbc3ee..5d4cf61919 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/update_form.html @@ -100,7 +100,7 @@

[update_form.html template markup lost in extraction; recoverable changes in the surrounding hunks: the "Image" heading becomes "Controller", and a heading is retitled "Login Nodes"]

region = cloudRegionInput.options[cloudRegionInput.selectedIndex].text; } else { zone = cloudZoneInput.value; - region = cloudZoneInput.value; + region = cloudRegionInput.value; } $.ajax({ @@ -452,40 +452,73 @@


dataType: "json", headers: { 'X-CSRFToken': $.cookie("csrftoken") } }).done(function (data) { - $(".part_formset_row").each(function () { + $(".part_formset_row, .login_row, .controller_row").each(function () { var formRow = $(this); var machineTypeSelect = formRow.find('.machine_type_select'); var machineType = machineTypeSelect.val(); + + var invalidDiskTypes = []; + + // Fetch the invalid disk types for the selected machine type + $.ajax({ + url: "{% url 'api-instancetype-list' %}" + machineType + "/?cluster={{ object.id }}®ion=" + region + "&zone=" + zone, + type: "GET", + dataType: "json", + async: false, // To ensure we get the data before proceeding + headers: { 'X-CSRFToken': $.cookie("csrftoken") } + }).done(function(machineData) { + invalidDiskTypes = machineData.invalid_disk_types || []; + }); + + var invalidDiskTypes = []; + + // Fetch the invalid disk types for the selected machine type + $.ajax({ + url: "{% url 'api-instancetype-list' %}" + machineType + "/?cluster={{ object.id }}®ion=" + region + "&zone=" + zone, + type: "GET", + dataType: "json", + async: false, // To ensure we get the data before proceeding + headers: { 'X-CSRFToken': $.cookie("csrftoken") } + }).done(function(machineData) { + invalidDiskTypes = machineData.invalid_disk_types || []; + }); formRow.find(".disk_type_select").each(function (pos, selObj) { - var curVal = selObj.value; - $(selObj).empty(); - - if (machineType && machineType.startsWith('c4-')) { - var option = document.createElement("option"); - option.text = "Hyperdisk Balanced Persistent Disk"; - option.setAttribute("value", "hyperdisk-balanced"); - selObj.appendChild(option); - } else { - var additionalDisk = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); - if (additionalDisk.endsWith("additional")) { - $.each(data.disks, function (i, disk_info) { - var option = document.createElement("option"); - option.text = disk_info.description; - option.setAttribute("value", disk_info.name); - selObj.appendChild(option); - }); - } else { - $.each(data.disks, function (i, disk_info) { - if (disk_info.name === 'local-ssd' || disk_info.name.startsWith("pd-")) { - var option = document.createElement("option"); - option.text = disk_info.description; - option.setAttribute("value", disk_info.name); - selObj.appendChild(option); - } - }); - } - } + var curVal = selObj.value; + $(selObj).empty(); + + if (machineType && + (machineType.startsWith('c4-') || + machineType.startsWith('n4-') || + machineType.startsWith('x4-'))) { + var option = document.createElement("option"); + option.text = "Hyperdisk Balanced Persistent Disk"; + option.setAttribute("value", "hyperdisk-balanced"); + selObj.appendChild(option); + } else { + var additionalDisk = selObj.id.slice(0, selObj.id.lastIndexOf("_disk_type")); + if (additionalDisk.endsWith("additional")) { + $.each(data.disks, function (i, disk_info) { + if (invalidDiskTypes.indexOf(disk_info.name) === -1) { + var option = document.createElement("option"); + option.text = disk_info.description; + option.setAttribute("value", disk_info.name); + selObj.appendChild(option); + } + }); + } else { + $.each(data.disks, function (i, disk_info) { + if ((disk_info.name === 'local-ssd' || + disk_info.name.startsWith("pd-")) && + invalidDiskTypes.indexOf(disk_info.name) === -1) { + var option = document.createElement("option"); + option.text = disk_info.description; + option.setAttribute("value", disk_info.name); + selObj.appendChild(option); + } + }); + } + } var id_prefix = selObj.id.slice(0, 
selObj.id.lastIndexOf("_disk_type")); var disk_size_sel = $(selObj).parentsUntil("tbody").find("#" + id_prefix + "_disk_size")[0]; diff --git a/community/front-end/ofe/website/ghpcfe/views/clusters.py b/community/front-end/ofe/website/ghpcfe/views/clusters.py index 733fade339..eac57a8139 100644 --- a/community/front-end/ofe/website/ghpcfe/views/clusters.py +++ b/community/front-end/ofe/website/ghpcfe/views/clusters.py @@ -468,6 +468,7 @@ def form_valid(self, form): parts = partitions.save() try: + total_nodes_requested = {} for part in parts: part.vCPU_per_node = machine_info[part.machine_type]["vCPU"] // (1 if part.enable_hyperthreads else 2) cpu_count = machine_info[part.machine_type]["vCPU"] @@ -507,6 +508,28 @@ def form_valid(self, form): raise ValidationError( f"Invalid combination: machine_type {part.machine_type} cannot be used with disk_type {disk_type}." ) + + # Sum the total nodes for each reservation + if part.reservation_name: + if part.reservation_name not in total_nodes_requested: + total_nodes_requested[part.reservation_name] = 0 + total_nodes_requested[part.reservation_name] += part.dynamic_node_count + part.static_node_count + + # Validate total requested nodes against available nodes + for reservation_name, requested_nodes in total_nodes_requested.items(): + reservation = cloud_info.get_vm_reservations( + "GCP", + self.object.cloud_credential.detail, + None, + self.object.cloud_zone + ) + matching_reservation = reservation.get(reservation_name) + available_nodes = int(matching_reservation["instanceProperties"].get("availableCount", 0)) + if requested_nodes > available_nodes: + raise ValidationError(f"Reservation {reservation_name} does not have enough available nodes." + f"Requested: {requested_nodes}, Available: {available_nodes}" + ) + except KeyError as err: raise ValidationError("Error in Partition - invalid machine type: " f"{part.machine_type}") from err diff --git a/community/front-end/ofe/website/nginx.conf b/community/front-end/ofe/website/nginx.conf index 6c4d691f2a..457edc43d5 100644 --- a/community/front-end/ofe/website/nginx.conf +++ b/community/front-end/ofe/website/nginx.conf @@ -40,7 +40,7 @@ http { } location /static/ { - alias ../hpc-toolkit/community/front-end/ofe/website/static/; + alias ../cluster-toolkit/community/front-end/ofe/website/static/; } location / { diff --git a/community/front-end/ofe/website/website/settings.py b/community/front-end/ofe/website/website/settings.py index d0616e6151..5ac2a911ed 100644 --- a/community/front-end/ofe/website/website/settings.py +++ b/community/front-end/ofe/website/website/settings.py @@ -81,7 +81,7 @@ def get_site_name(): # Build paths inside the project like this: BASE_DIR / 'subdir'. 
BASE_DIR = Path(__file__).resolve().parent.parent -MEDIA_ROOT = "/opt/gcluster/hpc-toolkit/community/front-end/ofe/website/startup-scripts:" +MEDIA_ROOT = "/opt/gcluster/cluster-toolkit/community/front-end/ofe/website/startup-scripts:" # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/3.1/howto/deployment/checklist/ diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/community/modules/compute/htcondor-execute-point/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ b/community/modules/compute/htcondor-execute-point/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 4e4e500f30..9353a60ede 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.39.0" } } diff --git a/community/modules/compute/mig/versions.tf b/community/modules/compute/mig/versions.tf index 2690b53cb0..0dd9be722b 100644 --- a/community/modules/compute/mig/versions.tf +++ b/community/modules/compute/mig/versions.tf @@ -22,6 +22,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:mig/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:mig/v1.39.0" } } diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index 0dd9b64775..dd68300561 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -75,7 +75,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| | [execution\_startup\_script](#module\_execution\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [pbs\_execution](#module\_pbs\_execution) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/compute/pbspro-execution/main.tf b/community/modules/compute/pbspro-execution/main.tf index b3e46cc2a8..9df32916a1 100644 --- a/community/modules/compute/pbspro-execution/main.tf +++ b/community/modules/compute/pbspro-execution/main.tf @@ -68,7 +68,7 @@ module "execution_startup_script" { } module "pbs_execution" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index e35c55bf3b..661022f0d9 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.39.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index e4947c1420..b5c8bfd98d 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.39.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 864b8933ad..e3b9a353ff 100644 --- 
a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -74,7 +74,7 @@ modules. For support with the underlying modules, see the instructions in the | Name | Source | Version | |------|--------|---------| -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.2 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf index f064171b67..5f692db27f 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/main.tf @@ -61,7 +61,7 @@ data "google_compute_default_service_account" "default" { module "slurm_nodeset_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.2" project_id = var.project_id region = var.region diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf index cc616cd258..b692ea8b17 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-dynamic/v1.39.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf index 783ba8e39a..d3669809df 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-tpu/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset-tpu/v1.39.0" } } diff --git 
a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf index 62cddfeb48..0a53ef95e2 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf @@ -29,6 +29,14 @@ variable "node_conf" { description = "Map of Slurm node line configuration." type = map(any) default = {} + validation { + condition = lookup(var.node_conf, "Sockets", null) == null + error_message = <<-EOD + `Sockets` field is in conflict with `SocketsPerBoard` which is automatically generated by SlurmGCP. + Instead, you can override the following fields: `Boards`, `SocketsPerBoard`, `CoresPerSocket`, and `ThreadsPerCore`. + See: https://slurm.schedmd.com/slurm.conf.html#OPT_Boards and https://slurm.schedmd.com/slurm.conf.html#OPT_Sockets_1 + EOD + } } variable "node_count_static" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf index 38330af5d0..7963852a10 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-nodeset/v1.39.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md index 33beb3418f..e68899def7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/README.md @@ -82,7 +82,7 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [exclusive](#input\_exclusive) | Exclusive job access to nodes. | `bool` | `true` | no | +| [exclusive](#input\_exclusive) | Exclusive job access to nodes. When set to true nodes execute single job and are deleted
after job exits. If set to false, multiple jobs can be scheduled on one node. | `bool` | `true` | no | | [is\_default](#input\_is\_default) | Sets this partition as the default partition by updating the partition\_conf.
If "Default" is already set in partition\_conf, this variable will have no effect. | `bool` | `false` | no | | [network\_storage](#input\_network\_storage) | DEPRECATED |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [nodeset](#input\_nodeset) | A list of nodesets.
For type definition see community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf::nodeset | `list(any)` | `[]` | no | @@ -91,7 +91,7 @@ No resources. | [partition\_conf](#input\_partition\_conf) | Slurm partition configuration as a map.
See https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION | `map(string)` | `{}` | no | | [partition\_name](#input\_partition\_name) | The name of the slurm partition. | `string` | n/a | yes | | [resume\_timeout](#input\_resume\_timeout) | Maximum time permitted (in seconds) between when a node resume request is issued and when the node is actually available for use.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'ResumeTimeout' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_ResumeTimeout_1 for details. | `number` | `300` | no | -| [suspend\_time](#input\_suspend\_time) | Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram.
This sets 'SuspendTime' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details.
NOTE: use value -1 to exclude partition from suspend. | `number` | `300` | no | +| [suspend\_time](#input\_suspend\_time) | Nodes which remain idle or down for this number of seconds will be placed into power save mode by SuspendProgram.
This sets 'SuspendTime' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details.
NOTE: use value -1 to exclude partition from suspend.
NOTE 2: if `var.exclusive` is set to true (default), nodes are deleted immediately after job finishes. | `number` | `300` | no | | [suspend\_timeout](#input\_suspend\_timeout) | Maximum time permitted (in seconds) between when a node suspend request is issued and when the node is shutdown.
If null is given, then a smart default will be chosen depending on nodesets in partition.
This sets 'SuspendTimeout' in partition\_conf.
See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTimeout_1 for details. | `number` | `null` | no | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf index 807ad3847e..e14e44b02a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/variables.tf @@ -41,7 +41,10 @@ variable "is_default" { } variable "exclusive" { - description = "Exclusive job access to nodes." + description = <<-EOD + Exclusive job access to nodes. When set to true nodes execute single job and are deleted + after job exits. If set to false, multiple jobs can be scheduled on one node. + EOD type = bool default = true } @@ -140,6 +143,7 @@ variable "suspend_time" { This sets 'SuspendTime' in partition_conf. See https://slurm.schedmd.com/slurm.conf.html#OPT_SuspendTime_1 for details. NOTE: use value -1 to exclude partition from suspend. + NOTE 2: if `var.exclusive` is set to true (default), nodes are deleted immediately after job finishes. EOD type = number default = 300 diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf index f0ea4295ce..51a4cedf2a 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/versions.tf @@ -18,6 +18,6 @@ terraform { required_version = ">= 1.3" provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-partition/v1.39.0" } } diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index e89f6764b6..5fbee1bf2e 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -86,6 +86,7 @@ No modules. | [sql\_password](#input\_sql\_password) | Password for the SQL database. | `any` | `null` | no | | [sql\_username](#input\_sql\_username) | Username for the SQL database | `string` | `"slurm"` | no | | [tier](#input\_tier) | The machine type to use for the SQL instance | `string` | n/a | yes | +| [user\_managed\_replication](#input\_user\_managed\_replication) | Replication parameters that will be used for defined secrets |
list(object({
location = string
kms_key_name = optional(string)
}))
| `[]` | no | ## Outputs diff --git a/community/modules/database/slurm-cloudsql-federation/main.tf b/community/modules/database/slurm-cloudsql-federation/main.tf index 09de939b72..6e2bfaceeb 100644 --- a/community/modules/database/slurm-cloudsql-federation/main.tf +++ b/community/modules/database/slurm-cloudsql-federation/main.tf @@ -19,6 +19,10 @@ locals { labels = merge(var.labels, { ghpc_module = "slurm-cloudsql-federation", ghpc_role = "database" }) } +locals { + user_managed_replication = var.user_managed_replication +} + resource "random_id" "resource_name_suffix" { byte_length = 4 } diff --git a/community/modules/database/slurm-cloudsql-federation/outputs.tf b/community/modules/database/slurm-cloudsql-federation/outputs.tf index 5f78c3adc8..21d8bbfcc9 100644 --- a/community/modules/database/slurm-cloudsql-federation/outputs.tf +++ b/community/modules/database/slurm-cloudsql-federation/outputs.tf @@ -18,9 +18,10 @@ output "cloudsql" { description = "Describes the cloudsql instance." sensitive = true value = { - server_ip = google_sql_database_instance.instance.ip_address[0].ip_address - user = google_sql_user.users.name - password = google_sql_user.users.password - db_name = google_sql_database.database.name + server_ip = google_sql_database_instance.instance.ip_address[0].ip_address + user = google_sql_user.users.name + password = google_sql_user.users.password + db_name = google_sql_database.database.name + user_managed_replication = local.user_managed_replication } } diff --git a/community/modules/database/slurm-cloudsql-federation/variables.tf b/community/modules/database/slurm-cloudsql-federation/variables.tf index 701f15d1ea..ec41c70e9d 100644 --- a/community/modules/database/slurm-cloudsql-federation/variables.tf +++ b/community/modules/database/slurm-cloudsql-federation/variables.tf @@ -96,3 +96,12 @@ variable "private_vpc_connection_peering" { type = string default = null } + +variable "user_managed_replication" { + type = list(object({ + location = string + kms_key_name = optional(string) + })) + description = "Replication parameters that will be used for defined secrets" + default = [] +} diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 37480f7cb9..c3e2e17f34 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.39.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index d649bf0ea0..1dc92b754e 100644 --- a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.39.0" } required_version = ">= 0.14.0" } diff --git 
a/community/modules/file-system/nfs-server/README.md b/community/modules/file-system/nfs-server/README.md index 198e5014b6..c2fe8bebfd 100644 --- a/community/modules/file-system/nfs-server/README.md +++ b/community/modules/file-system/nfs-server/README.md @@ -126,7 +126,7 @@ No modules. | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used as name of the NFS instance if no name is specified. | `string` | n/a | yes | | [disk\_size](#input\_disk\_size) | Storage size gb | `number` | `"100"` | no | | [image](#input\_image) | DEPRECATED: The VM image used by the nfs server | `string` | `null` | no | -| [instance\_image](#input\_instance\_image) | The VM image used by the nfs server.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "hpc-centos-7",
"project": "cloud-hpc-image-public"
}
| no | +| [instance\_image](#input\_instance\_image) | The VM image used by the nfs server.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "hpc-rocky-linux-8",
"project": "cloud-hpc-image-public"
}
| no | | [labels](#input\_labels) | Labels to add to the NFS instance. Key-value pairs. | `map(string)` | n/a | yes | | [local\_mounts](#input\_local\_mounts) | Mountpoint for this NFS compute instance | `list(string)` |
[
"/data"
]
| no | | [machine\_type](#input\_machine\_type) | Type of the VM instance to use | `string` | `"n2d-standard-2"` | no | diff --git a/community/modules/file-system/nfs-server/variables.tf b/community/modules/file-system/nfs-server/variables.tf index c09564bc0e..c7db5b9cfa 100644 --- a/community/modules/file-system/nfs-server/variables.tf +++ b/community/modules/file-system/nfs-server/variables.tf @@ -72,7 +72,7 @@ variable "instance_image" { type = map(string) default = { project = "cloud-hpc-image-public" - family = "hpc-centos-7" + family = "hpc-rocky-linux-8" } validation { diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 0d08aa7deb..52b3087016 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.39.0" } required_version = ">= 0.14.0" diff --git a/community/modules/files/fsi-montecarlo-on-batch/versions.tf b/community/modules/files/fsi-montecarlo-on-batch/versions.tf index d3e1124ef4..fb590f7a6f 100644 --- a/community/modules/files/fsi-montecarlo-on-batch/versions.tf +++ b/community/modules/files/fsi-montecarlo-on-batch/versions.tf @@ -35,9 +35,9 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:fsi-montecarlo-on-batch/v1.39.0" } } diff --git a/community/modules/network/private-service-access/versions.tf b/community/modules/network/private-service-access/versions.tf index 3569a93f37..635a858afd 100644 --- a/community/modules/network/private-service-access/versions.tf +++ b/community/modules/network/private-service-access/versions.tf @@ -30,11 +30,11 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:private-service-access/v1.39.0" } required_version = ">= 1.2" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index c32a7e9ca6..a40251b1ea 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.39.0" } required_version = ">= 0.14.0" diff --git a/community/modules/pubsub/bigquery-sub/versions.tf b/community/modules/pubsub/bigquery-sub/versions.tf index d59dc83874..af06c5ff3f 100644 --- a/community/modules/pubsub/bigquery-sub/versions.tf +++ b/community/modules/pubsub/bigquery-sub/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.38.0" + module_name = 
"blueprints/terraform/hpc-toolkit:bigquery-sub/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:bigquery-sub/v1.39.0" } required_version = ">= 1.0" } diff --git a/community/modules/pubsub/topic/versions.tf b/community/modules/pubsub/topic/versions.tf index e9c1a1d319..2189a1688c 100644 --- a/community/modules/pubsub/topic/versions.tf +++ b/community/modules/pubsub/topic/versions.tf @@ -27,6 +27,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:topic/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:topic/v1.39.0" } } diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index 19ab5361f7..610cdab231 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -64,7 +64,7 @@ No providers. | Name | Source | Version | |------|--------|---------| | [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [instances](#module\_instances) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | ## Resources @@ -85,11 +85,11 @@ No resources. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. Requires virtual workstation accelerator if Nvidia Grid Drivers are required |
list(object({
type = string,
count = number
}))
|
[
{
"count": 1,
"type": "nvidia-tesla-t4-vws"
}
]
| no | | [install\_nvidia\_driver](#input\_install\_nvidia\_driver) | Installs the nvidia driver (true/false). For details, see https://cloud.google.com/compute/docs/gpus/install-drivers-gpu | `bool` | n/a | yes | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is
name="debian-12-bookworm-v20240312" and project="debian-cloud".
NOTE: uses fixed version of image to avoid NVIDIA driver compatibility issues.

An alternative image is from name="ubuntu-2204-jammy-v20240126" and project="ubuntu-os-cloud".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"name": "debian-12-bookworm-v20240312",
"project": "debian-cloud"
}
| no | +| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is
name="debian-12-bookworm-v20240815" and project="debian-cloud".
NOTE: uses fixed version of image to avoid NVIDIA driver compatibility issues.

An alternative image is from name="ubuntu-2204-jammy-v20240126" and project="ubuntu-os-cloud".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"name": "debian-12-bookworm-v20240815",
"project": "debian-cloud"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation. Must be N1 family if GPU is used. | `string` | `"n1-standard-8"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | -| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | +| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | | [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface
**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `enable_public_ips` also do not apply
to network interfaces defined in this variable.
Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network to attach the VM. | `string` | `"default"` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | diff --git a/community/modules/remote-desktop/chrome-remote-desktop/main.tf b/community/modules/remote-desktop/chrome-remote-desktop/main.tf index 936cc75d3c..023fc81ffa 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/main.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/main.tf @@ -71,7 +71,7 @@ module "client_startup_script" { } module "instances" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" instance_count = var.instance_count name_prefix = var.name_prefix diff --git a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf index 41916e70ff..df31cf5f34 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf +++ b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf @@ -58,7 +58,7 @@ variable "network_storage" { variable "instance_image" { description = <<-EOD Image used to build chrome remote desktop node. The default image is - name="debian-12-bookworm-v20240312" and project="debian-cloud". + name="debian-12-bookworm-v20240815" and project="debian-cloud". NOTE: uses fixed version of image to avoid NVIDIA driver compatibility issues. An alternative image is from name="ubuntu-2204-jammy-v20240126" and project="ubuntu-os-cloud". @@ -71,7 +71,7 @@ variable "instance_image" { type = map(string) default = { project = "debian-cloud" - name = "debian-12-bookworm-v20240312" + name = "debian-12-bookworm-v20240815" } } @@ -95,9 +95,9 @@ variable "auto_delete_boot_disk" { variable "name_prefix" { description = <<-EOT - An optional name for all VM and disk resources. - If not supplied, `deployment_name` will be used. - When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set, + An optional name for all VM and disk resources. + If not supplied, `deployment_name` will be used. + When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set, then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". 
EOT type = string diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 4473fa1c46..073d8a161a 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.39.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 60bdd4f8ac..f4bf842159 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.39.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 254362717d..604297e0b5 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.39.0" } required_version = ">= 1.3.0" diff --git a/community/modules/scheduler/pbspro-client/README.md b/community/modules/scheduler/pbspro-client/README.md index 43b71423fc..6f7a7b938c 100644 --- a/community/modules/scheduler/pbspro-client/README.md +++ b/community/modules/scheduler/pbspro-client/README.md @@ -75,7 +75,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| | [client\_startup\_script](#module\_client\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [pbs\_client](#module\_pbs\_client) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-client/main.tf b/community/modules/scheduler/pbspro-client/main.tf index e427c3945e..c1fa0e211f 100644 --- a/community/modules/scheduler/pbspro-client/main.tf +++ b/community/modules/scheduler/pbspro-client/main.tf @@ -57,7 +57,7 @@ module "client_startup_script" { } module "pbs_client" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/pbspro-server/README.md b/community/modules/scheduler/pbspro-server/README.md index f27384fc3c..e432e0cbbf 100644 --- a/community/modules/scheduler/pbspro-server/README.md +++ b/community/modules/scheduler/pbspro-server/README.md @@ -71,7 +71,7 @@ No providers. |------|--------|---------| | [pbs\_install](#module\_pbs\_install) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-install | v1.36.0&depth=1 | | [pbs\_qmgr](#module\_pbs\_qmgr) | github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/pbspro-qmgr | v1.36.0&depth=1 | -| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | v1.36.0&depth=1 | +| [pbs\_server](#module\_pbs\_server) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance | 09ae2725 | | [server\_startup\_script](#module\_server\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | ## Resources diff --git a/community/modules/scheduler/pbspro-server/main.tf b/community/modules/scheduler/pbspro-server/main.tf index bfbf635aa4..b5e924d969 100644 --- a/community/modules/scheduler/pbspro-server/main.tf +++ b/community/modules/scheduler/pbspro-server/main.tf @@ -70,7 +70,7 @@ module "server_startup_script" { } module "pbs_server" { - source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=v1.36.0&depth=1" + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/compute/vm-instance?ref=09ae2725" instance_count = var.instance_count spot = var.spot diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 50f719575a..6fef0fff97 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -70,7 +70,7 @@ activated through the `enable_reconfigure` setting: To reconfigure a running cluster: 1. Edit the blueprint with the desired configuration changes -1. Call `ghpc create -w` to overwrite the deployment directory +1. 
Call `gcluster create -w` to overwrite the deployment directory 1. Follow instructions in terminal to deploy The following are examples of updates that can be made to a running cluster: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index e3513e58be..c747ec3e35 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.39.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 3c5bb6bf5d..ab68579d98 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.39.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index dc16e79894..30ee38d084 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -49,7 +49,7 @@ partitions and slurm configuration in a running, active cluster. To reconfigure a running cluster: 1. Edit the blueprint with the desired configuration changes -2. 
Call `ghpc create -w` to overwrite the deployment directory +2. Call `gcluster create -w` to overwrite the deployment directory 3. Follow instructions in terminal to deploy The following are examples of updates that can be made to a running cluster: @@ -181,14 +181,12 @@ limitations under the License. |------|---------| | [terraform](#requirement\_terraform) | >= 1.3 | | [google](#requirement\_google) | >= 4.84 | -| [null](#requirement\_null) | >= 3.0 | ## Providers | Name | Version | |------|---------| | [google](#provider\_google) | >= 4.84 | -| [null](#provider\_null) | >= 3.0 | ## Modules @@ -196,13 +194,15 @@ limitations under the License. |------|--------|---------| | [bucket](#module\_bucket) | terraform-google-modules/cloud-storage/google | ~> 5.0 | | [daos\_network\_storage\_scripts](#module\_daos\_network\_storage\_scripts) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.36.0&depth=1 | -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.1 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | +| [nodeset\_cleanup](#module\_nodeset\_cleanup) | ./modules/cleanup_compute | n/a | +| [nodeset\_tpu\_cleanup](#module\_nodeset\_tpu\_cleanup) | ./modules/cleanup_compute | n/a | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.2 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.2 | | [slurm\_files](#module\_slurm\_files) | ./modules/slurm_files | n/a | -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.1 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | -| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.1 | -| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.1 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance | 6.6.2 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.2 | +| [slurm\_nodeset\_template](#module\_slurm\_nodeset\_template) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 6.6.2 | +| [slurm\_nodeset\_tpu](#module\_slurm\_nodeset\_tpu) | github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_nodeset_tpu | 6.6.2 | ## Resources @@ -213,8 +213,6 @@ limitations under the License. 
| [google_secret_manager_secret_version.cloudsql_version](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource | | [google_storage_bucket_iam_binding.legacy_readers](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_iam_binding) | resource | | [google_storage_bucket_iam_binding.viewers](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_iam_binding) | resource | -| [null_resource.cleanup_compute](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | -| [null_resource.cleanup_compute_depenencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | | [google_compute_default_service_account.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | | [google_compute_image.slurm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | @@ -229,8 +227,8 @@ limitations under the License. | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | -| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access. |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
})
| `null` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | @@ -245,7 +243,7 @@ limitations under the License. | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes and controller will be destroyed. | `bool` | `true` | no | +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes will be destroyed. | `bool` | `true` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_controller\_public\_ips](#input\_enable\_controller\_public\_ips) | If set to true. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf index d2c345cf8d..ac1535a92c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/controller.tf @@ -44,7 +44,7 @@ locals { # INSTANCE TEMPLATE module "slurm_controller_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.2" project_id = var.project_id region = var.region @@ -100,7 +100,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.2" access_config = var.enable_controller_public_ips ? [local.access_config] : [] add_hostname_suffix = false @@ -116,12 +116,7 @@ module "slurm_controller_instance" { zone = var.zone metadata = var.metadata - labels = merge(local.labels, local.files_cs_labels) - - depends_on = [ - # Ensure that controller is destroyed BEFORE doing cleanup - null_resource.cleanup_compute[0], - ] + labels = local.labels } # SECRETS: CLOUDSQL @@ -131,7 +126,27 @@ resource "google_secret_manager_secret" "cloudsql" { secret_id = "${local.slurm_cluster_name}-slurm-secret-cloudsql" replication { - auto {} + dynamic "auto" { + for_each = length(var.cloudsql.user_managed_replication) == 0 ? [1] : [] + content {} + } + dynamic "user_managed" { + for_each = length(var.cloudsql.user_managed_replication) == 0 ? 
[] : [1] + content { + dynamic "replicas" { + for_each = nonsensitive(var.cloudsql.user_managed_replication) + content { + location = replicas.value.location + dynamic "customer_managed_encryption" { + for_each = compact([replicas.value.kms_key_name]) + content { + kms_key_name = customer_managed_encryption.value + } + } + } + } + } + } } labels = { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf index e693dc22f8..00945db930 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/login.tf @@ -14,7 +14,7 @@ # TEMPLATE module "slurm_login_template" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=6.6.2" for_each = { for x in var.login_nodes : x.name_prefix => x } @@ -25,7 +25,6 @@ module "slurm_login_template" { name_prefix = each.value.name_prefix additional_disks = each.value.additional_disks - additional_networks = each.value.additional_networks bandwidth_tier = each.value.bandwidth_tier can_ip_forward = each.value.can_ip_forward disable_smt = each.value.disable_smt @@ -57,7 +56,7 @@ module "slurm_login_template" { # INSTANCE module "slurm_login_instance" { - source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.1" + source = "github.com/GoogleCloudPlatform/slurm-gcp.git//terraform/slurm_cluster/modules/_slurm_instance?ref=6.6.2" for_each = { for x in var.login_nodes : x.name_prefix => x } access_config = each.value.access_config @@ -69,13 +68,14 @@ module "slurm_login_instance" { slurm_cluster_name = local.slurm_cluster_name instance_template = module.slurm_login_template[each.key].self_link - labels = merge(each.value.labels, local.files_cs_labels) + labels = each.value.labels num_instances = each.value.num_instances - region = each.value.region - static_ips = each.value.static_ips - subnetwork = each.value.subnetwork - zone = each.value.zone + additional_networks = each.value.additional_networks + region = each.value.region + static_ips = each.value.static_ips + subnetwork = each.value.subnetwork + zone = each.value.zone # trigger replacement of login nodes when the controller instance is replaced replace_trigger = module.slurm_controller_instance.instances_self_links[0] diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf index 095d4efdbb..2e8cca6728 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/main.tf @@ -25,7 +25,6 @@ locals { tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) slurm_cluster_name = coalesce(var.slurm_cluster_name, local.tmp_cluster_name) - files_cs_labels = { slurm_files_checksum = module.slurm_files.checksum } universe_domain = { "universe_domain" = var.universe_domain } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/README.md new file mode 100644 index 0000000000..0405c09f78 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/README.md @@ -0,0 +1,41 @@ + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.3 | +| [null](#requirement\_null) | >= 3.0 | + +## Providers + +| Name | Version | +|------|---------| +| [null](#provider\_null) | >= 3.0 | + +## Modules + +No modules. + +## Resources + +| Name | Type | +|------|------| +| [null_resource.dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | +| [null_resource.script](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes will be destroyed. | `bool` | n/a | yes | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
| n/a | yes | +| [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | n/a | yes | +| [nodeset](#input\_nodeset) | Nodeset to cleanup |
object({
nodeset_name = string
subnetwork_self_link = string
additional_networks = list(object({
subnetwork = string
}))
})
| n/a | yes | +| [project\_id](#input\_project\_id) | Project ID | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Name of the Slurm cluster | `string` | n/a | yes | +| [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | n/a | yes | + +## Outputs + +No outputs. + diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf similarity index 68% rename from community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf rename to community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf index e1fdf74611..05d9b91cf1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/cleanup.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/main.tf @@ -14,36 +14,32 @@ locals { cleanup_dependencies_agg = flatten([ - [ - for ns in var.nodeset : [ - ns.subnetwork_self_link, - [for an in ns.additional_networks : an.subnetwork] - ] - ], - [for ns in var.nodeset_tpu : ns.subnetwork], - ]) + var.nodeset.subnetwork_self_link, + var.nodeset.additional_networks[*].subnetwork]) } -resource "null_resource" "cleanup_compute_depenencies" { +# Can not use variadic list in `depends_on`, wrap it into a collection of `null_resource` +resource "null_resource" "dependencies" { count = length(local.cleanup_dependencies_agg) } -resource "null_resource" "cleanup_compute" { +resource "null_resource" "script" { count = var.enable_cleanup_compute ? 1 : 0 triggers = { project_id = var.project_id - cluster_name = local.slurm_cluster_name + cluster_name = var.slurm_cluster_name + nodeset_name = var.nodeset.nodeset_name universe_domain = var.universe_domain compute_endpoint_version = var.endpoint_versions.compute gcloud_path_override = var.gcloud_path_override } provisioner "local-exec" { - command = "/bin/bash ${path.module}/scripts/cleanup_compute.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" + command = "/bin/bash ${path.module}/scripts/cleanup_compute.sh ${self.triggers.project_id} ${self.triggers.cluster_name} ${self.triggers.nodeset_name} ${self.triggers.universe_domain} ${self.triggers.compute_endpoint_version} ${self.triggers.gcloud_path_override}" when = destroy } # Ensure that clean up is done before attempt to delete the networks - depends_on = [null_resource.cleanup_compute_depenencies] + depends_on = [null_resource.dependencies] } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh new file mode 100755 index 0000000000..671e7a0d27 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/scripts/cleanup_compute.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e -o pipefail
+
+project="$1"
+cluster_name="$2"
+nodeset_name="$3"
+universe_domain="$4"
+compute_endpoint_version="$5"
+gcloud_dir="$6"
+
+if [[ $# -ne 5 ]] && [[ $# -ne 6 ]]; then
+ echo "Usage: $0 <project> <cluster_name> <nodeset_name> <universe_domain> <compute_endpoint_version> [<gcloud_dir>]"
+ exit 1
+fi
+
+if [[ -n "${gcloud_dir}" ]]; then
+ export PATH="$gcloud_dir:$PATH"
+fi
+
+export CLOUDSDK_API_ENDPOINT_OVERRIDES_COMPUTE="https://www.${universe_domain}/compute/${compute_endpoint_version}/"
+export CLOUDSDK_CORE_PROJECT="${project}"
+
+if ! type -P gcloud 1>/dev/null; then
+ echo "gcloud is not available and your compute resources are not being cleaned up"
+ echo "https://console.cloud.google.com/compute/instances?project=${project}"
+ exit 1
+fi
+
+echo "Deleting compute nodes"
+node_filter="name:${cluster_name}-${nodeset_name}-* labels.slurm_cluster_name=${cluster_name} AND labels.slurm_instance_role=compute"
+
+tmpfile=$(mktemp) # have to use a temp file, since `< <(gcloud ...)` doesn't work nicely with `head`
+trap 'rm -f "$tmpfile"' EXIT
+
+running_nodes_filter="${node_filter} AND (status!=STOPPING AND status!=TERMINATED)"
+# List all currently running instances and attempt to delete them
+gcloud compute instances list --format="value(selfLink)" --filter="${running_nodes_filter}" >"$tmpfile"
+# Do 10 instances at a time
+while batch="$(head -n 10)" && [[ ${#batch} -gt 0 ]]; do
+ nodes=$(echo "$batch" | paste -sd " " -) # concat into a single space-separated line
+ # The lack of quotes around ${nodes} is intentional and causes each new space-separated "word" to
+ # be treated as independent arguments. See PR#2523
+ # shellcheck disable=SC2086
+ gcloud compute instances delete --quiet ${nodes} || echo "Failed to delete some instances"
+done <"$tmpfile"
+
+# In case if controller tries to delete the nodes as well,
+# wait until nodes in STOPPING state are deleted, before deleting the resource policies
+stopping_nodes_filter="${node_filter} AND status=STOPPING"
+while true; do
+ node=$(gcloud compute instances list --format="value(name)" --filter="${stopping_nodes_filter}" --limit=1)
+ if [[ -z "${node}" ]]; then
+ break
+ fi
+ echo "Waiting for instances to be deleted: ${node}"
+ sleep 5
+done
+
+echo "Deleting resource policies"
+policies_filter="name:${cluster_name}-${nodeset_name}-slurmgcp-managed-*"
+gcloud compute resource-policies list --format="value(selfLink)" --filter="${policies_filter}" | while read -r line; do
+ echo "Deleting resource policy: $line"
+ gcloud compute resource-policies delete --quiet "${line}" || {
+ echo "Failed to delete resource policy: $line"
+ }
+done
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/variables.tf new file mode 100644 index 0000000000..a40aab0f26 --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/cleanup_compute/variables.tf @@ -0,0 +1,66 @@ +/** + * Copyright (C) SchedMD LLC. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + type = string + description = "Project ID" +} + + +variable "slurm_cluster_name" { + type = string + description = "Name of the Slurm cluster" +} + +variable "enable_cleanup_compute" { + description = < [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. | `string` | `null` | no | | [bucket\_name](#input\_bucket\_name) | Name of GCS bucket to use. | `string` | n/a | yes | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Default behavior defined in scripts/conf.py |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | | [cloudsql\_secret](#input\_cloudsql\_secret) | Secret URI to cloudsql secret. | `string` | `null` | no | | [compute\_startup\_scripts](#input\_compute\_startup\_scripts) | List of scripts to be ran on compute VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | @@ -80,7 +84,7 @@ No modules. | [google\_app\_cred\_path](#input\_google\_app\_cred\_path) | Path to Google Application Credentials. | `string` | `null` | no | | [install\_dir](#input\_install\_dir) | Directory where the hybrid configuration directory will be installed on the
on-premise controller (e.g. /etc/slurm/hybrid). This updates the prefix path
for the resume and suspend scripts in the generated `cloud.conf` file.

This variable should be used when the TerraformHost and the SlurmctldHost
are different.

This will default to var.output\_dir if null. | `string` | `null` | no | | [job\_submit\_lua\_tpl](#input\_job\_submit\_lua\_tpl) | Slurm job\_submit.lua template file path. | `string` | `null` | no | -| [login\_network\_storage](#input\_login\_network\_storage) | Storage to mounted on login and controller instances
* server\_ip : Address of the storage server.
* remote\_mount : The location in the remote instance filesystem to mount from.
* local\_mount : The location on the instance filesystem to mount to.
* fs\_type : Filesystem type (e.g. "nfs").
* mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | +| [login\_network\_storage](#input\_login\_network\_storage) | Storage to mounted on login and controller instances
- server\_ip : Address of the storage server.
- remote\_mount : The location in the remote instance filesystem to mount from.
- local\_mount : The location on the instance filesystem to mount to.
- fs\_type : Filesystem type (e.g. "nfs").
- mount\_options : Options to mount with. |
list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
}))
| `[]` | no | | [login\_startup\_scripts](#input\_login\_startup\_scripts) | List of scripts to be ran on login VM startup. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [munge\_mount](#input\_munge\_mount) | Remote munge mount for compute and login nodes to acquire the munge.key.

By default, the munge mount server will be assumed to be the
`var.slurm_control_host` (or `var.slurm_control_addr` if non-null) when
`server_ip=null`. |
object({
server_ip = string
remote_mount = string
fs_type = string
mount_options = string
})
|
{
"fs_type": "nfs",
"mount_options": "",
"remote_mount": "/etc/munge/",
"server_ip": null
}
| no | @@ -106,7 +110,6 @@ No modules. | Name | Description | |------|-------------| -| [checksum](#output\_checksum) | Checksum of all files written to the bucket. | | [config](#output\_config) | Cluster configuration. | | [nodeset](#output\_nodeset) | Cluster nodesets. | | [nodeset\_dyn](#output\_nodeset\_dyn) | Cluster nodesets (dynamic). | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf index c25748dc48..0cf9981f5a 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/main.tf @@ -40,6 +40,8 @@ resource "random_uuid" "cluster_id" { ################## locals { + tp = "${local.bucket_dir}/" # prefix to trim from the bucket path to get a "file name" + config = { enable_slurm_gcp_plugins = var.enable_slurm_gcp_plugins enable_bigquery_load = var.enable_bigquery_load @@ -67,15 +69,6 @@ locals { epilog_scripts = [for k, v in google_storage_bucket_object.epilog_scripts : k] cloud_parameters = var.cloud_parameters - partitions = { for p in var.partitions : p.partition_name => p } - nodeset = { - for n in var.nodeset : n.nodeset_name => merge(n, { - instance_properties = jsondecode(n.instance_properties_json) - }) - } - nodeset_dyn = { for n in var.nodeset_dyn : n.nodeset_name => n } - nodeset_tpu = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } - # hybrid hybrid = var.enable_hybrid google_app_cred_path = var.enable_hybrid ? local.google_app_cred_path : null @@ -95,10 +88,30 @@ locals { # Providers endpoint_versions = var.endpoint_versions - } - config_yaml = "config.yaml" - config_yaml_bucket = format("%s/%s", local.bucket_dir, local.config_yaml) + # Extra-files MD5 hashes + # Makes config file creation depend on the files + # Allows for informed updates & checks on slurmsync side + slurm_gcp_scripts_md5 = google_storage_bucket_object.devel.md5hash, + controller_startup_scripts_md5 = { + for o in values(google_storage_bucket_object.controller_startup_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + compute_startup_scripts_md5 = { + for o in values(google_storage_bucket_object.compute_startup_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + nodeset_startup_scripts_md5 = { + for o in values(google_storage_bucket_object.nodeset_startup_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + login_startup_scripts_md5 = { + for o in values(google_storage_bucket_object.login_startup_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + prolog_scripts_md5 = { + for o in values(google_storage_bucket_object.prolog_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + epilog_scripts_md5 = { + for o in values(google_storage_bucket_object.epilog_scripts) : trimprefix(o.name, local.tp) => o.md5hash + } + } x_nodeset = toset(var.nodeset[*].nodeset_name) x_nodeset_dyn = toset(var.nodeset_dyn[*].nodeset_name) @@ -128,10 +141,44 @@ locals { resource "google_storage_bucket_object" "config" { bucket = data.google_storage_bucket.this.name - name = local.config_yaml_bucket + name = "${local.bucket_dir}/config.yaml" content = yamlencode(local.config) } +resource "google_storage_bucket_object" "parition_config" { + for_each = { for p in var.partitions : p.partition_name => p } + + bucket = data.google_storage_bucket.this.name + name = "${local.bucket_dir}/partition_configs/${each.key}.yaml" + content 
= yamlencode(each.value) +} + +resource "google_storage_bucket_object" "nodeset_config" { + for_each = { for ns in var.nodeset : ns.nodeset_name => merge(ns, { + instance_properties = jsondecode(ns.instance_properties_json) + }) } + + bucket = data.google_storage_bucket.this.name + name = "${local.bucket_dir}/nodeset_configs/${each.key}.yaml" + content = yamlencode(each.value) +} + +resource "google_storage_bucket_object" "nodeset_dyn_config" { + for_each = { for ns in var.nodeset_dyn : ns.nodeset_name => ns } + + bucket = data.google_storage_bucket.this.name + name = "${local.bucket_dir}/nodeset_dyn_configs/${each.key}.yaml" + content = yamlencode(each.value) +} + +resource "google_storage_bucket_object" "nodeset_tpu_config" { + for_each = { for n in var.nodeset_tpu[*].nodeset : n.nodeset_name => n } + + bucket = data.google_storage_bucket.this.name + name = "${local.bucket_dir}/nodeset_tpu_configs/${each.key}.yaml" + content = yamlencode(each.value) +} + ######### # DEVEL # ######### @@ -255,17 +302,6 @@ data "local_file" "setup_external" { } locals { - checksum = md5(join("", flatten([ - google_storage_bucket_object.config.md5hash, - google_storage_bucket_object.devel.md5hash, - [for k, f in google_storage_bucket_object.controller_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.compute_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.nodeset_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.login_startup_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.prolog_scripts : f.md5hash], - [for k, f in google_storage_bucket_object.epilog_scripts : f.md5hash] - ]))) - external_epilog = [{ filename = "z_external_epilog.sh" content = data.local_file.external_epilog.content diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf index 3b680b50a7..36cf0b646c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/outputs.tf @@ -53,8 +53,3 @@ output "nodeset_tpu" { description = "Cluster nodesets (TPU)." value = lookup(local.config, "nodeset_tpu", null) } - -output "checksum" { - description = "Checksum of all files written to the bucket." - value = local.checksum -} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py index eaa16abc23..c3b31f20a2 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/conf.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import List, Optional, Iterable, Dict +from typing import List, Optional, Iterable, Dict, Set from itertools import chain from collections import defaultdict import json @@ -125,6 +125,7 @@ def get(key, default): "TreeWidth": get("tree_width", default_tree_width), "JobSubmitPlugins": "lua" if any_tpu else None, "TopologyPlugin": topology_plugin(lkp), + "TopologyParam": get("topology_param", "SwitchAsNodeRank"), } return dict_to_conf(conf_options, delim="\n") @@ -440,10 +441,60 @@ def render_conf_lines(self) -> Iterable[str]: for s in sorted(self.switches.values(), key=lambda s: s.name): yield from s.render_conf_lines() +class TopologySummary: + """ + Represents a summary of the topology, to make judgements about changes. + To be stored in JSON file along side of topology.conf to simplify parsing. + """ + def __init__( + self, + physical_host: Optional[Dict[str, str]] = None, + down_nodes: Optional[Iterable[str]] = None, + tpu_nodes: Optional[Iterable[str]] = None, + ) -> None: + self.physical_host = physical_host or {} + self.down_nodes = set(down_nodes or []) + self.tpu_nodes = set(tpu_nodes or []) + + + @classmethod + def loads(cls, s: str) -> "TopologySummary": + d = json.loads(s) + return cls( + physical_host=d.get("physical_host"), + down_nodes=d.get("down_nodes"), + tpu_nodes=d.get("tpu_nodes"), + ) + + def dumps(self) -> str: + return json.dumps( + { + "physical_host": self.physical_host, + "down_nodes": list(self.down_nodes), + "tpu_nodes": list(self.tpu_nodes), + }, + indent=2) + + def _nodenames(self) -> Set[str]: + return set(self.physical_host) | self.down_nodes | self.tpu_nodes + + def requires_reconfigure(self, prev: "TopologySummary") -> bool: + """ + Reconfigure IFF one of the following occurs: + * A node is added + * A node get a non-empty physicalHost + """ + if len(self._nodenames() - prev._nodenames()) > 0: + return True + for n, ph in self.physical_host.items(): + if ph and ph != prev.physical_host.get(n): + return True + return False class TopologyBuilder: def __init__(self) -> None: self._r = Switch("") # fake root, not part of the tree + self.summary = TopologySummary() def add(self, path: List[str], nodes: Iterable[str]) -> None: n = self._r @@ -460,6 +511,7 @@ def render_conf_lines(self) -> Iterable[str]: def compress(self) -> "TopologyBuilder": compressed = TopologyBuilder() + compressed.summary = self.summary def _walk( u: Switch, c: Switch ): # u: uncompressed node, c: its counterpart in compressed tree @@ -479,7 +531,9 @@ def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.L pref = ["tpu-root", f"ns_{nodeset.nodeset_name}"] if tpuobj.vmcount == 1: # Put all nodes in one switch - bldr.add(pref, list(chain(static, dynamic))) + all_nodes = list(chain(static, dynamic)) + bldr.add(pref, all_nodes) + bldr.summary.tpu_nodes.update(all_nodes) return # Chunk nodes into sub-switches of size `vmcount` @@ -488,16 +542,48 @@ def add_tpu_nodeset_topology(nodeset: object, bldr: TopologyBuilder, lkp: util.L for nodeschunk in util.chunked(nodenames, n=tpuobj.vmcount): chunk_name = f"{nodeset.nodeset_name}-{chunk_num}" chunk_num += 1 - bldr.add([*pref, chunk_name], list(nodeschunk)) + bldr.add([*pref, chunk_name], nodeschunk) + bldr.summary.tpu_nodes.update(nodeschunk) +_SLURM_TOPO_ROOT = "slurm-root" + +def _make_physical_path(physical_host: str) -> List[str]: + assert physical_host.startswith("/"), f"Unexpected physicalHost: {physical_host}" + parts = physical_host[1:].split("/") + # Due to issues with Slurm's topology plugin, we can 
not use all components of `physicalHost`, + # trim it down to `cluster/rack`. + short_path = parts[:2] + return [_SLURM_TOPO_ROOT, *short_path] def add_nodeset_topology( nodeset: object, bldr: TopologyBuilder, lkp: util.Lookup ) -> None: - path = ["slurm-root", f"ns_{nodeset.nodeset_name}"] - nodes = list(chain(*lkp.nodenames(nodeset))) - bldr.add(path, nodes) - + up_nodes = set() + default_path = [_SLURM_TOPO_ROOT, f"ns_{nodeset.nodeset_name}"] + + for inst in lkp.instances().values(): + try: + if lkp.node_nodeset_name(inst.name) != nodeset.nodeset_name: + continue + except Exception: + continue + + phys_host = inst.resourceStatus.get("physicalHost", "") + bldr.summary.physical_host[inst.name] = phys_host + up_nodes.add(inst.name) + + if phys_host: + bldr.add(_make_physical_path(phys_host), [inst.name]) + else: + bldr.add(default_path, [inst.name]) + + down_nodes = [] + for node in chain(*lkp.nodenames(nodeset)): + if node not in up_nodes: + down_nodes.append(node) + if down_nodes: + bldr.add(default_path, down_nodes) + bldr.summary.down_nodes.update(down_nodes) def gen_topology(lkp: util.Lookup) -> TopologyBuilder: bldr = TopologyBuilder() @@ -513,26 +599,35 @@ def gen_topology_conf(lkp: util.Lookup) -> bool: Generates slurm topology.conf. Returns whether the topology.conf got updated. """ - bldr = gen_topology(lkp).compress() + topo = gen_topology(lkp).compress() conf_file = lkp.etc_dir / "cloud_topology.conf" - old_hash = util.hash_file(conf_file) if conf_file.exists() else "" + with open(conf_file, "w") as f: f.writelines(FILE_PREAMBLE + "\n") - for line in bldr.render_conf_lines(): + for line in topo.render_conf_lines(): f.write(line) f.write("\n") f.write("\n") - new_hash = util.hash_file(conf_file) - return old_hash != new_hash + summary_file = lkp.etc_dir / "cloud_topology.summary.json" + prev_summary = TopologySummary() + if summary_file.exists(): + prev_summary = TopologySummary.loads(summary_file.read_text()) + summary_file.write_text(topo.summary.dumps()) + + return topo.summary.requires_reconfigure(prev_summary) def install_topology_conf(lkp: util.Lookup) -> None: conf_file = lkp.etc_dir / "cloud_topology.conf" + summary_file = lkp.etc_dir / "cloud_topology.summary.json" topo_conf = lkp.etc_dir / "topology.conf" + if not topo_conf.exists(): topo_conf.symlink_to(conf_file) + util.chown_slurm(conf_file, mode=0o600) + util.chown_slurm(summary_file, mode=0o600) def gen_controller_configs(lkp: util.Lookup) -> None: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py index 0e6a5074ca..354ec81ad3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/get_tpu_vmcount.py @@ -20,8 +20,8 @@ def get_vmcount_of_tpu_part(part): res = 0 - for ns in util.lkp.cfg.partitions[part].partition_nodeset_tpu: - tpu_obj = util.TPU(util.lkp.cfg.nodeset_tpu[ns]) + for ns in util.lookup().cfg.partitions[part].partition_nodeset_tpu: + tpu_obj = util.TPU(util.lookup().cfg.nodeset_tpu[ns]) if res == 0: res = tpu_obj.vmcount else: @@ -53,7 +53,7 @@ def get_vmcount_of_tpu_part(part): # valid equals to 0 means that we are ok, otherwise it will be set to one of the previously defined exit codes valid = 0 for part in args.partitions.split(","): - if part not in 
util.lkp.cfg.partitions: + if part not in util.lookup().cfg.partitions: valid = PART_INVALID break else: diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py index f876827a4c..800202d2ea 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/load_bq.py @@ -28,8 +28,7 @@ from google.api_core import retry, exceptions import util -from util import run -from util import cfg +from util import lookup, run SACCT = "sacct" @@ -51,6 +50,8 @@ def make_datetime(time_string): + if time_string == "None": + return None return datetime.strptime(time_string, SLURM_TIME_FORMAT).replace( tzinfo=timezone.utc ) @@ -176,14 +177,14 @@ def schema_field(field_name, data_type, description, required=False): Job = namedtuple("Job", job_schema.keys()) client = bq.Client( - project=cfg.project, + project=lookup().cfg.project, credentials=util.default_credentials(), client_options=util.create_client_options(util.ApiEndpoint.BQ), ) -dataset_id = f"{cfg.slurm_cluster_name}_job_data" -dataset = bq.DatasetReference(project=cfg.project, dataset_id=dataset_id) +dataset_id = f"{lookup().cfg.slurm_cluster_name}_job_data" +dataset = bq.DatasetReference(project=lookup().project, dataset_id=dataset_id) table = bq.Table( - bq.TableReference(dataset, f"jobs_{cfg.slurm_cluster_name}"), schema_fields + bq.TableReference(dataset, f"jobs_{lookup().cfg.slurm_cluster_name}"), schema_fields ) @@ -198,8 +199,8 @@ def make_job_row(job): if field_name in job } job_row["entry_uuid"] = uuid.uuid4().hex - job_row["cluster_id"] = cfg.cluster_id - job_row["cluster_name"] = cfg.slurm_cluster_name + job_row["cluster_id"] = lookup().cfg.cluster_id + job_row["cluster_name"] = lookup().cfg.slurm_cluster_name return job_row @@ -310,7 +311,7 @@ def update_job_idx_cache(jobs, timestamp): def main(): - if not cfg.enable_bigquery_load: + if not lookup().cfg.enable_bigquery_load: print("bigquery load is not currently enabled") exit(0) init_table() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py index 6289268cdd..e20a7ed195 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py @@ -41,7 +41,7 @@ trim_self_link, wait_for_operation, ) -from util import cfg, lkp, NSDict, TPU +from util import lookup, NSDict, TPU import slurm_gcp_plugins @@ -61,8 +61,8 @@ def instance_properties(nodeset, model, placement_group, labels=None): props = NSDict() if labels: # merge in extra labels on instance and disks - template = lkp.node_template(model) - template_info = lkp.template_info(template) + template = lookup().node_template(model) + template_info = lookup().template_info(template) props.labels = {**template_info.labels, **labels} @@ -75,7 +75,6 @@ def instance_properties(nodeset, model, placement_group, labels=None): if placement_group: props.scheduling = { "onHostMaintenance": "TERMINATE", - "automaticRestart": False, } props.resourcePolicies = [placement_group] @@ -85,7 +84,7 @@ def instance_properties(nodeset, model, placement_group, 
labels=None): zones = list(nodeset.zone_policy_allow or []) assert len(zones) == 1, "Only single zone is supported if using a reservation" - reservation = lkp.reservation(reservation_name, zones[0]) + reservation = lookup().reservation(reservation_name, zones[0]) props.reservationAffinity = { "consumeReservationType": "SPECIFIC_RESERVATION", @@ -97,7 +96,6 @@ def instance_properties(nodeset, model, placement_group, labels=None): if policies: props.scheduling = { "onHostMaintenance": "TERMINATE", - "automaticRestart": False, } props.resourcePolicies = policies log.info( @@ -135,10 +133,10 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None # model here indicates any node that can be used to describe the rest model = next(iter(nodes)) - nodeset = lkp.node_nodeset(model) - template = lkp.node_template(model) - region = lkp.node_region(model) - partition = cfg.partitions[partition_name] + nodeset = lookup().node_nodeset(model) + template = lookup().node_template(model) + region = lookup().node_region(model) + partition = lookup().cfg.partitions[partition_name] log.debug(f"create_instances_request: {model} placement: {placement_group}") body = NSDict() @@ -173,16 +171,16 @@ def create_instances_request(nodes, partition_name, placement_group, job_id=None } body.locationPolicy.targetShape = nodeset.zone_target_shape - if lkp.cfg.enable_slurm_gcp_plugins: + if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.pre_instance_bulk_insert( - lkp=lkp, + lkp=lookup(), nodes=nodes, placement_group=placement_group, request_body=body, ) - request = lkp.compute.regionInstances().bulkInsert( - project=cfg.project, region=region, body=body.to_dict() + request = lookup().compute.regionInstances().bulkInsert( + project=lookup().project, region=region, body=body.to_dict() ) if log.isEnabledFor(logging.DEBUG): @@ -228,7 +226,7 @@ def group_nodes_bulk(nodes, resume_data=None): ) jobless_nodes_tpu = [] for jobless_node in jobless_nodes[:]: - if lkp.node_is_tpu(jobless_node): + if lookup().node_is_tpu(jobless_node): jobless_nodes.remove(jobless_node) jobless_nodes_tpu.append(jobless_node) @@ -268,7 +266,7 @@ def group_nodes_bulk(nodes, resume_data=None): for job_id, job in jobs.items() if not job.tpu for placement_group, pg_nodes in job.placement_groups.items() - for prefix, nodes in util.groupby_unsorted(pg_nodes, lkp.node_prefix) + for prefix, nodes in util.groupby_unsorted(pg_nodes, lookup().node_prefix) for i, chunk_nodes in enumerate(chunked(nodes, n=BULK_INSERT_LIMIT)) ] grouped_nodes_tpu = [ @@ -281,8 +279,8 @@ def group_nodes_bulk(nodes, resume_data=None): ) for job_id, job in jobs.items() if job.tpu - for prefix, nodes in util.groupby_unsorted(job.nodes_resume, lkp.node_prefix) - for i, chunk_nodes in enumerate(lkp.chunk_tpu_nodes(list(nodes))) + for prefix, nodes in util.groupby_unsorted(job.nodes_resume, lookup().node_prefix) + for i, chunk_nodes in enumerate(lookup().chunk_tpu_nodes(list(nodes))) ] def group_name(chunk: BulkChunk): @@ -339,7 +337,7 @@ def resume_nodes(nodes: List[str], resume_data=None): if resume_data is None and global_resume_data is not None: resume_data = global_resume_data.deepcopy() - nodes = sorted(nodes, key=lkp.node_prefix) + nodes = sorted(nodes, key=lookup().node_prefix) grouped_nodes, grouped_tpu_nodes = group_nodes_bulk(nodes, resume_data) if log.isEnabledFor(logging.DEBUG): @@ -365,7 +363,7 @@ def resume_nodes(nodes: List[str], resume_data=None): # do not create multiple tpu_objs if nodes with the same prefix are used if chunk.prefix 
not in tpu_objs.keys(): model = chunk.nodes[0] - tpu_objs[chunk.prefix] = TPU(lkp.node_nodeset(model)) + tpu_objs[chunk.prefix] = TPU(lookup().node_nodeset(model)) tpu_start_data.append({"tpu": tpu_objs[chunk.prefix], "node": chunk.nodes}) @@ -466,8 +464,8 @@ def update_job_comment(nodelist: str, comment: str): if any(map(lambda node: node in nodes, util.to_hostnames(job.nodelist_resume))) ) for job in job_list: - run(f"{lkp.scontrol} update jobid={job.job_id} admincomment='{comment}'") - run(f"{lkp.scontrol} notify {job.job_id} '{comment}'") + run(f"{lookup().scontrol} update jobid={job.job_id} admincomment='{comment}'") + run(f"{lookup().scontrol} notify {job.job_id} '{comment}'") def down_nodes(nodelist, reason): @@ -475,13 +473,13 @@ def down_nodes(nodelist, reason): if isinstance(nodelist, list): nodelist = util.to_hostlist(nodelist) update_job_comment(nodelist, reason) - run(f"{lkp.scontrol} update nodename={nodelist} state=down reason='{reason}'") + run(f"{lookup().scontrol} update nodename={nodelist} state=down reason='{reason}'") def hold_job(job_id, reason): """hold job, set comment to reason""" - run(f"{lkp.scontrol} hold jobid={job_id}") - run(f"{lkp.scontrol} update jobid={job_id} comment='{reason}'") + run(f"{lookup().scontrol} hold jobid={job_id}") + run(f"{lookup().scontrol} update jobid={job_id} comment='{reason}'") def create_placement_request(pg_name, region): @@ -492,12 +490,12 @@ def create_placement_request(pg_name, region): "collocation": "COLLOCATED", }, } - if lkp.cfg.enable_slurm_gcp_plugins: + if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.pre_placement_group_insert( - lkp=lkp, pg_name=pg_name, region=region, request_body=config + lkp=lookup(), pg_name=pg_name, region=region, request_body=config ) - request = lkp.compute.resourcePolicies().insert( - project=cfg.project, region=region, body=config + request = lookup().compute.resourcePolicies().insert( + project=lookup().project, region=region, body=config ) log_api_request(request) return request @@ -505,7 +503,7 @@ def create_placement_request(pg_name, region): def create_placement_groups(node_list: list, job_id=0): pgs = {} - node_map = lkp.nodeset_map(node_list) + node_map = lookup().nodeset_map(node_list) for _, nodes in node_map.items(): pgs.update(create_nodeset_placement_groups(nodes, job_id=job_id)) return pgs @@ -513,15 +511,15 @@ def create_placement_groups(node_list: list, job_id=0): def create_nodeset_placement_groups(node_list: list, job_id=0): model = next(iter(node_list)) - nodeset = lkp.node_nodeset(model) + nodeset = lookup().node_nodeset(model) if not nodeset.enable_placement: return {None: node_list} if not valid_placement_nodes(node_list): return {None: node_list} - region = lkp.node_region(model) + region = lookup().node_region(model) groups = { - f"{cfg.slurm_cluster_name}-{nodeset.nodeset_name}-{job_id}-{i}": nodes + f"{lookup().cfg.slurm_cluster_name}-slurmgcp-managed-{nodeset.nodeset_name}-{job_id}-{i}": nodes for i, nodes in enumerate(chunked(node_list, n=PLACEMENT_MAX_CNT)) } @@ -579,7 +577,7 @@ def classify_result(item): def valid_placement_nodes(nodelist): invalid_types = frozenset(["e2", "t2d", "n1", "t2a", "m1", "m2", "m3"]) for node in nodelist: - mt = lkp.node_template_info(node).machineType + mt = lookup().node_template_info(node).machineType if mt.split("-")[0] in invalid_types: log.warn(f"Unsupported machine type for placement policy: {mt}.") log.warn( @@ -608,7 +606,7 @@ def main(nodelist): log.debug(f"ResumeProgram {nodelist}") # Filter out nodes not in 
config.yaml other_nodes, pm_nodes = separate( - lkp.is_power_managed_node, util.to_hostnames(nodelist) + lookup().is_power_managed_node, util.to_hostnames(nodelist) ) if other_nodes: log.debug( @@ -626,7 +624,7 @@ def main(nodelist): resume_nodes(pm_nodes, global_resume_data) # TODO only run below if resume_nodes succeeds but # resume_nodes does not currently return any status. - if lkp.cfg.enable_slurm_gcp_plugins: + if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.post_main_resume_nodes( nodelist=nodelist, global_resume_data=global_resume_data ) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py index bee74a9cdf..589cfeadef 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup.py @@ -26,8 +26,7 @@ import util from util import ( - lkp, - cfg, + lookup, dirs, slurmdirs, run, @@ -102,10 +101,10 @@ def end_motd(broadcast=True): return run( - "wall -n '*** Slurm {} setup complete ***'".format(lkp.instance_role), + "wall -n '*** Slurm {} setup complete ***'".format(lookup().instance_role), timeout=30, ) - if lkp.instance_role != "controller": + if not lookup().is_controller: run( """wall -n ' /home on the controller was mounted over the existing /home. @@ -126,13 +125,13 @@ def failed_motd(): def run_custom_scripts(): """run custom scripts based on instance_role""" custom_dir = dirs.custom_scripts - if lkp.is_controller: + if lookup().is_controller: # controller has all scripts, but only runs controller.d custom_dirs = [custom_dir / "controller.d"] - elif lkp.instance_role == "compute": + elif lookup().instance_role == "compute": # compute setup with compute.d and nodeset.d custom_dirs = [custom_dir / "compute.d", custom_dir / "nodeset.d"] - elif lkp.instance_role == "login": + elif lookup().instance_role == "login": # login setup with only login.d custom_dirs = [custom_dir / "login.d"] else: @@ -150,11 +149,11 @@ def run_custom_scripts(): try: for script in custom_scripts: if "/controller.d/" in str(script): - timeout = lkp.cfg.get("controller_startup_scripts_timeout", 300) + timeout = lookup().cfg.get("controller_startup_scripts_timeout", 300) elif "/compute.d/" in str(script) or "/nodeset.d/" in str(script): - timeout = lkp.cfg.get("compute_startup_scripts_timeout", 300) + timeout = lookup().cfg.get("compute_startup_scripts_timeout", 300) elif "/login.d/" in str(script): - timeout = lkp.cfg.get("login_startup_scripts_timeout", 300) + timeout = lookup().cfg.get("login_startup_scripts_timeout", 300) else: timeout = 300 timeout = None if not timeout or timeout < 0 else timeout @@ -173,24 +172,9 @@ def run_custom_scripts(): log.error(f"script {script} did not complete within timeout={timeout}") raise e except Exception as e: - log.error(f"script {script} encountered an exception") - log.exception(e) + log.exception(f"script {script} encountered an exception") raise e - -def setup_secondary_disks(): - """Format and mount secondary disk""" - run( - "sudo mkfs.ext4 -m 0 -F -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/sdb" - ) - with open("/etc/fstab", "a") as f: - f.write( - "\n/dev/sdb {0} ext4 discard,defaults,nofail 0 2".format( - dirs.secdisk - ) - ) - - def setup_jwt_key(): jwt_key = Path(slurmdirs.state / "jwt_hs256.key") @@ -280,13 +264,13 @@ def 
configure_mysql(): timeout=30, ) run( - f"""{mysql} "drop user 'slurm'@'{lkp.control_host}'";""", + f"""{mysql} "drop user 'slurm'@'{lookup().control_host}'";""", timeout=30, check=False, ) - run(f"""{mysql} "create user 'slurm'@'{lkp.control_host}'";""", timeout=30) + run(f"""{mysql} "create user 'slurm'@'{lookup().control_host}'";""", timeout=30) run( - f"""{mysql} "grant all on slurm_acct_db.* TO 'slurm'@'{lkp.control_host}'";""", + f"""{mysql} "grant all on slurm_acct_db.* TO 'slurm'@'{lookup().control_host}'";""", timeout=30, ) @@ -294,27 +278,27 @@ def configure_mysql(): def configure_dirs(): for p in dirs.values(): util.mkdirp(p) - util.chown_slurm(dirs.slurm) - util.chown_slurm(dirs.scripts) - + + for p in (dirs.slurm, dirs.scripts, dirs.custom_scripts): + util.chown_slurm(p) + for p in slurmdirs.values(): util.mkdirp(p) util.chown_slurm(p) - etc_slurm = Path("/etc/slurm") - if etc_slurm.exists() and etc_slurm.is_symlink(): - etc_slurm.unlink() - etc_slurm.symlink_to(slurmdirs.etc) - - scripts_etc = dirs.scripts / "etc" - if scripts_etc.exists() and scripts_etc.is_symlink(): - scripts_etc.unlink() - scripts_etc.symlink_to(slurmdirs.etc) + for sl, tgt in ( # create symlinks + (Path("/etc/slurm"), slurmdirs.etc), + (dirs.scripts / "etc", slurmdirs.etc), + (dirs.scripts / "log", dirs.log), + ): + if sl.exists() and sl.is_symlink(): + sl.unlink() + sl.symlink_to(tgt) - scripts_log = dirs.scripts / "log" - if scripts_log.exists() and scripts_log.is_symlink(): - scripts_log.unlink() - scripts_log.symlink_to(dirs.log) + for f in ("sort_nodes.py",): # copy auxiliary scripts + dst = Path(lookup().cfg.slurm_bin_dir) / f + shutil.copyfile(util.scripts_dir / f, dst) + os.chmod(dst, 0o755) def setup_controller(): @@ -322,18 +306,15 @@ def setup_controller(): log.info("Setting up controller") util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) install_custom_scripts() - conf.gen_controller_configs(lkp) + conf.gen_controller_configs(lookup()) setup_jwt_key() setup_munge_key() setup_sudoers() - - if cfg.controller_secondary_disk: - setup_secondary_disks() setup_network_storage() run_custom_scripts() - if not cfg.cloudsql_secret: + if not lookup().cfg.cloudsql_secret: configure_mysql() run("systemctl enable slurmdbd", timeout=30) @@ -344,7 +325,7 @@ def setup_controller(): sacctmgr = f"{slurmdirs.prefix}/bin/sacctmgr -i" result = run( - f"{sacctmgr} add cluster {cfg.slurm_cluster_name}", timeout=30, check=False + f"{sacctmgr} add cluster {lookup().cfg.slurm_cluster_name}", timeout=30, check=False ) if "already exists" in result.stdout: log.info(result.stdout) @@ -382,11 +363,11 @@ def setup_controller(): def setup_login(): """run login node setup""" log.info("Setting up login") - slurmctld_host = f"{lkp.control_host}" - if lkp.control_addr: - slurmctld_host = f"{lkp.control_host}({lkp.control_addr})" + slurmctld_host = f"{lookup().control_host}" + if lookup().control_addr: + slurmctld_host = f"{lookup().control_host}({lookup().control_addr})" slurmd_options = [ - f'--conf-server="{slurmctld_host}:{lkp.control_host_port}"', + f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"', f'--conf="Feature={conf.login_nodeset}"', "-Z", ] @@ -414,11 +395,11 @@ def setup_compute(): """run compute node setup""" log.info("Setting up compute") util.chown_slurm(dirs.scripts / "config.yaml", mode=0o600) - slurmctld_host = f"{lkp.control_host}" - if lkp.control_addr: - slurmctld_host = f"{lkp.control_host}({lkp.control_addr})" + slurmctld_host = f"{lookup().control_host}" + if 
lookup().control_addr: + slurmctld_host = f"{lookup().control_host}({lookup().control_addr})" slurmd_options = [ - f'--conf-server="{slurmctld_host}:{lkp.control_host_port}"', + f'--conf-server="{slurmctld_host}:{lookup().control_host_port}"', ] try: @@ -459,16 +440,30 @@ def setup_compute(): def main(): start_motd() - configure_dirs() + + log.info("Starting setup, fetching config") + sleep_seconds = 5 + while True: + try: + _, cfg = util.fetch_config() + util.update_config(cfg) + break + except util.DeffetiveStoredConfigError as e: + log.warning(f"config is not ready yet: {e}, sleeping for {sleep_seconds}s") + except Exception as e: + log.exception(f"unexpected error while fetching config, sleeping for {sleep_seconds}s") + time.sleep(sleep_seconds) + log.info("Config fetched") + configure_dirs() # call the setup function for the instance type { "controller": setup_controller, "compute": setup_compute, "login": setup_login, }.get( - lkp.instance_role, - lambda: log.fatal(f"Unknown node role: {lkp.instance_role}"))() + lookup().instance_role, + lambda: log.fatal(f"Unknown node role: {lookup().instance_role}"))() end_motd() @@ -478,8 +473,6 @@ def main(): parser.add_argument("--slurmd-feature", dest="slurmd_feature", help="Unused, to be removed.") _ = util.init_log_and_parse(parser) - lkp = util.Lookup(cfg) # noqa F811 - try: main() except subprocess.TimeoutExpired as e: @@ -508,7 +501,6 @@ def main(): ) log.error("Aborting setup...") failed_motd() - except Exception as e: - log.exception(e) - log.error("Aborting setup...") + except Exception: + log.exception("Aborting setup...") failed_motd() diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py index 65e5301481..80e4be9397 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/setup_network_storage.py @@ -27,7 +27,7 @@ from addict import Dict as NSDict import util -from util import lkp, run, cfg, dirs, separate +from util import lookup, run, dirs, separate from more_executors import Executors, ExceptionRetryPolicy @@ -41,21 +41,21 @@ def mounts_by_local(mounts): def resolve_network_storage(nodeset=None): """Combine appropriate network_storage fields to a single list""" - if lkp.instance_role == "compute": + if lookup().instance_role == "compute": try: - nodeset = lkp.node_nodeset() + nodeset = lookup().node_nodeset() except Exception: # External nodename, skip lookup nodeset = None # seed mounts with the default controller mounts - if cfg.disable_default_mounts: + if lookup().cfg.disable_default_mounts: default_mounts = [] else: default_mounts = [ NSDict( { - "server_ip": lkp.control_addr or lkp.control_host, + "server_ip": lookup().control_addr or lookup().control_host, "remote_mount": str(path), "local_mount": str(path), "fs_type": "nfs", @@ -73,9 +73,9 @@ def resolve_network_storage(nodeset=None): # On non-controller instances, entries in network_storage could overwrite # default exports from the controller. 
Be careful, of course - mounts.update(mounts_by_local(cfg.network_storage)) - if lkp.instance_role in ("login", "controller"): - mounts.update(mounts_by_local(cfg.login_network_storage)) + mounts.update(mounts_by_local(lookup().cfg.network_storage)) + if lookup().instance_role in ("login", "controller"): + mounts.update(mounts_by_local(lookup().cfg.login_network_storage)) if nodeset is not None: mounts.update(mounts_by_local(nodeset.network_storage)) @@ -89,7 +89,7 @@ def internal_mount(mount): # NOTE: Valid Lustre server_ip can take the form of '@tcp' server_ip = mount.server_ip.split("@")[0] mount_addr = util.host_lookup(server_ip) - return mount_addr == lkp.control_host_addr + return mount_addr == lookup().control_host_addr return separate(internal_mount, mounts) @@ -102,7 +102,7 @@ def setup_network_storage(): all_mounts = resolve_network_storage() ext_mounts, int_mounts = separate_external_internal_mounts(all_mounts) - if lkp.is_controller: + if lookup().is_controller: mounts = ext_mounts else: mounts = ext_mounts + int_mounts @@ -193,16 +193,16 @@ def mount_path(path): def munge_mount_handler(): - if not cfg.munge_mount: + if not lookup().cfg.munge_mount: log.error("Missing munge_mount in cfg") - elif lkp.is_controller: + elif lookup().is_controller: return - mount = cfg.munge_mount + mount = lookup().cfg.munge_mount server_ip = ( mount.server_ip if mount.server_ip - else (cfg.slurm_control_addr or cfg.slurm_control_host) + else (lookup().cfg.slurm_control_addr or lookup().cfg.slurm_control_host) ) remote_mount = mount.remote_mount local_mount = Path("/mnt/munge") @@ -213,8 +213,6 @@ def munge_mount_handler(): else "defaults,hard,intr,_netdev" ) - munge_key = Path(dirs.munge / "munge.key") - log.info(f"Mounting munge share to: {local_mount}") local_mount.mkdir() if fs_type.lower() == "gcsfuse".lower(): @@ -228,7 +226,7 @@ def munge_mount_handler(): ] else: if remote_mount is None: - remote_mount = Path("/etc/munge") + remote_mount = dirs.munge cmd = [ "mount", f"--types={fs_type}", @@ -252,6 +250,7 @@ def munge_mount_handler(): else: raise err + munge_key = Path(dirs.munge / "munge.key") log.info(f"Copy munge.key from: {local_mount}") shutil.copy2(Path(local_mount / "munge.key"), munge_key) @@ -276,18 +275,18 @@ def setup_nfs_exports(): mounts.append( NSDict( { - "server_ip": cfg.munge_mount.server_ip, - "remote_mount": cfg.munge_mount.remote_mount, + "server_ip": lookup().cfg.munge_mount.server_ip, + "remote_mount": lookup().cfg.munge_mount.remote_mount, "local_mount": Path(f"{dirs.munge}_tmp"), - "fs_type": cfg.munge_mount.fs_type, - "mount_options": cfg.munge_mount.mount_options, + "fs_type": lookup().cfg.munge_mount.fs_type, + "mount_options": lookup().cfg.munge_mount.mount_options, } ) ) # controller mounts _, con_mounts = separate_external_internal_mounts(mounts) con_mounts = {m.remote_mount: m for m in con_mounts} - for nodeset in cfg.nodeset.values(): + for nodeset in lookup().cfg.nodeset.values(): # get internal mounts for each nodeset by calling # resolve_network_storage as from a node in each nodeset ns_mounts = resolve_network_storage(nodeset=nodeset) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py index e648d6b80c..002057a3b1 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py +++ 
b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/slurmsync.py @@ -17,7 +17,6 @@ import argparse import datetime import fcntl -import hashlib import json import logging import re @@ -26,26 +25,24 @@ from itertools import chain from pathlib import Path import yaml +import datetime as dt +from datetime import datetime +from typing import Dict, Tuple import util from util import ( batch_execute, ensure_execute, execute_with_futures, - fetch_config_yaml, - fetch_config_yaml_md5, install_custom_scripts, - load_config_file, run, - save_config, separate, to_hostlist_fast, - Lookup, NSDict, TPU, chunked, ) -from util import lkp, CONFIG_FILE +from util import lookup from suspend import delete_instances from resume import start_tpu import conf @@ -72,9 +69,9 @@ def start_instance_op(inst): - return lkp.compute.instances().start( - project=lkp.project, - zone=lkp.instance(inst).zone, + return lookup().compute.instances().start( + project=lookup().project, + zone=lookup().instance(inst).zone, instance=inst, ) @@ -82,16 +79,14 @@ def start_instance_op(inst): def start_instances(node_list): log.info("{} instances to start ({})".format(len(node_list), ",".join(node_list))) - normal, tpu_nodes = separate(lkp.node_is_tpu, node_list) - invalid, valid = separate(lambda inst: bool(lkp.instance), normal) - - ops = {inst: start_instance_op(inst) for inst in valid} + normal, tpu_nodes = separate(lookup().node_is_tpu, node_list) + ops = {inst: start_instance_op(inst) for inst in normal} done, failed = batch_execute(ops) tpu_start_data = [] - for ns, nodes in util.groupby_unsorted(tpu_nodes, lkp.node_nodeset_name): - tpuobj = TPU(lkp.cfg.nodeset_tpu[ns]) + for ns, nodes in util.groupby_unsorted(tpu_nodes, lookup().node_nodeset_name): + tpuobj = TPU(lookup().cfg.nodeset_tpu[ns]) for snodes in chunked(nodes, n=tpuobj.vmcount): tpu_start_data.append({"tpu": tpuobj, "node": snodes}) execute_with_futures(start_tpu, tpu_start_data) @@ -105,14 +100,14 @@ def _find_dynamic_node_status() -> NodeStatus: def _find_tpu_node_status(nodename, state): - ns = lkp.node_nodeset(nodename) + ns = lookup().node_nodeset(nodename) tpuobj = TPU(ns) inst = tpuobj.get_node(nodename) # If we do not find the node but it is from a Tpu that has multiple vms look for the master node if inst is None and tpuobj.vmcount > 1: # Get the tpu slurm nodelist of the nodes in the same tpu group as nodename nodelist = run( - f"{lkp.scontrol} show topo {nodename}" + f"{lookup().scontrol} show topo {nodename}" + " | awk -F'=' '/Level=0/ { print $NF }'", shell=True, ).stdout @@ -142,7 +137,7 @@ def _find_tpu_node_status(nodename, state): & state.flags ): return NodeStatus.unbacked - if lkp.is_static_node(nodename): + if lookup().is_static_node(nodename): return NodeStatus.resume elif ( state is not None @@ -152,7 +147,7 @@ def _find_tpu_node_status(nodename, state): ): if tpuobj.preemptible: return NodeStatus.preempted - if not state.base.startswith("DOWN"): + if state.base != "DOWN": return NodeStatus.terminated elif ( state is None or "POWERED_DOWN" in state.flags @@ -166,16 +161,16 @@ def _find_tpu_node_status(nodename, state): def find_node_status(nodename): """Determine node/instance status that requires action""" - state = lkp.slurm_node(nodename) + state = lookup().slurm_node(nodename) - if lkp.node_is_dyn(nodename): + if lookup().node_is_dyn(nodename): return _find_dynamic_node_status() - if lkp.node_is_tpu(nodename): + if lookup().node_is_tpu(nodename): return _find_tpu_node_status(nodename, state) # split 
below is workaround for VMs whose hostname is FQDN - inst = lkp.instance(nodename.split(".")[0]) + inst = lookup().instance(nodename.split(".")[0]) power_flags = frozenset( ("POWER_DOWN", "POWERING_UP", "POWERING_DOWN", "POWERED_DOWN") ) & (state.flags if state is not None else set()) @@ -193,7 +188,7 @@ def find_node_status(nodename): return NodeStatus.unbacked if state.base == "DOWN" and not power_flags: return NodeStatus.power_down - if "POWERED_DOWN" in state.flags and lkp.is_static_node(nodename): + if "POWERED_DOWN" in state.flags and lookup().is_static_node(nodename): return NodeStatus.resume elif ( state is not None @@ -203,7 +198,7 @@ def find_node_status(nodename): ): if inst.scheduling.preemptible: return NodeStatus.preempted - if not state.base.startswith("DOWN"): + if state.base != "DOWN": return NodeStatus.terminated elif (state is None or "POWERED_DOWN" in state.flags) and inst.status == "RUNNING": log.info("%s is potential orphan node", nodename) @@ -235,8 +230,8 @@ def _seconds_since_timestamp(timestamp): """ if timestamp[-3] == ":": # python 36 datetime does not support the colon timestamp = timestamp[:-3] + timestamp[-2:] - creation_dt = datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f%z") - return datetime.datetime.now().timestamp() - creation_dt.timestamp() + creation_dt = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f%z") + return datetime.now().timestamp() - creation_dt.timestamp() def do_node_update(status, nodes): @@ -252,7 +247,7 @@ def nodes_down(): f"{count} nodes set down due to node status '{status.name}' ({hostlist})" ) run( - f"{lkp.scontrol} update nodename={hostlist} state=down reason='Instance stopped/deleted'" + f"{lookup().scontrol} update nodename={hostlist} state=down reason='Instance stopped/deleted'" ) def nodes_restart(): @@ -263,12 +258,12 @@ def nodes_restart(): def nodes_idle(): """idle nodes""" log.info(f"{count} nodes to idle ({hostlist})") - run(f"{lkp.scontrol} update nodename={hostlist} state=resume") + run(f"{lookup().scontrol} update nodename={hostlist} state=resume") def nodes_resume(): """resume nodes via scontrol""" log.info(f"{count} instances to resume ({hostlist})") - run(f"{lkp.scontrol} update nodename={hostlist} state=power_up") + run(f"{lookup().scontrol} update nodename={hostlist} state=power_up") def nodes_delete(): """delete instances for nodes""" @@ -278,38 +273,34 @@ def nodes_delete(): def nodes_power_down(): """power_down node in slurm""" log.info(f"{count} instances to power down ({hostlist})") - run(f"{lkp.scontrol} update nodename={hostlist} state=power_down") + run(f"{lookup().scontrol} update nodename={hostlist} state=power_down") def nodes_unknown(): """Error status, nodes shouldn't get in this status""" log.error(f"{count} nodes have unexpected status: ({hostlist})") first = next(iter(nodes)) - state = lkp.slurm_node(first) + state = lookup().slurm_node(first) state = "{}+{}".format(state.base, "+".join(state.flags)) if state else "None" - inst = lkp.instance(first) + inst = lookup().instance(first) log.error(f"{first} state: {state}, instance status:{inst.status}") - update = dict.get( - { - NodeStatus.orphan: nodes_delete, - NodeStatus.power_down: nodes_power_down, - NodeStatus.preempted: lambda: (nodes_down(), nodes_restart()), - NodeStatus.restore: nodes_idle, - NodeStatus.resume: nodes_resume, - NodeStatus.terminated: nodes_down, - NodeStatus.unbacked: nodes_down, - NodeStatus.unchanged: lambda: None, - NodeStatus.unknown: nodes_unknown, - }, - status, - ) - update() + { + NodeStatus.orphan: 
nodes_delete, + NodeStatus.power_down: nodes_power_down, + NodeStatus.preempted: lambda: (nodes_down(), nodes_restart()), + NodeStatus.restore: nodes_idle, + NodeStatus.resume: nodes_resume, + NodeStatus.terminated: nodes_down, + NodeStatus.unbacked: nodes_down, + NodeStatus.unchanged: lambda: None, + NodeStatus.unknown: nodes_unknown, + }[status]() def delete_placement_groups(placement_groups): def delete_placement_request(pg_name, region): - return lkp.compute.resourcePolicies().delete( - project=lkp.project, region=region, resourcePolicy=pg_name + return lookup().compute.resourcePolicies().delete( + project=lookup().project, region=region, resourcePolicy=pg_name ) requests = { @@ -348,18 +339,18 @@ def sync_placement_groups(): keep_jobs = { str(job["job_id"]) - for job in json.loads(run(f"{lkp.scontrol} show jobs --json").stdout)["jobs"] + for job in json.loads(run(f"{lookup().scontrol} show jobs --json").stdout)["jobs"] if "job_state" in job and set(job["job_state"]) & keep_states } keep_jobs.add("0") # Job 0 is a placeholder for static node placement fields = "items.regions.resourcePolicies,nextPageToken" - flt = f"name={lkp.cfg.slurm_cluster_name}-*" - act = lkp.compute.resourcePolicies() - op = act.aggregatedList(project=lkp.project, fields=fields, filter=flt) + flt = f"name={lookup().cfg.slurm_cluster_name}-*" + act = lookup().compute.resourcePolicies() + op = act.aggregatedList(project=lookup().project, fields=fields, filter=flt) placement_groups = {} pg_regex = re.compile( - rf"{lkp.cfg.slurm_cluster_name}-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" + rf"{lookup().cfg.slurm_cluster_name}-(?P[^\s\-]+)-(?P\d+)-(?P\d+)" ) while op is not None: result = ensure_execute(op) @@ -384,9 +375,9 @@ def sync_placement_groups(): def sync_slurm(): compute_instances = [ - name for name, inst in lkp.instances().items() if inst.role == "compute" + name for name, inst in lookup().instances().items() if inst.role == "compute" ] - slurm_nodes = list(lkp.slurm_nodes().keys()) + slurm_nodes = list(lookup().slurm_nodes().keys()) all_nodes = list( set( @@ -413,55 +404,35 @@ def sync_slurm(): do_node_update(status, nodes) -def read_hash(filename): - filename = Path(filename) - if not filename.exists(): - return None - with open(filename, "r", encoding="utf-8") as file: - return file.readline() - - -def save_hash(filename, hash): - with open(filename, "w+", encoding="utf-8") as file: - file.write(hash) - - def reconfigure_slurm(): - CONFIG_HASH = Path("/slurm/scripts/.config.hash") update_msg = "*** slurm configuration was updated ***" - cfg_old = load_config_file(CONFIG_FILE) - - if cfg_old.hybrid: + if lookup().cfg.hybrid: # terraform handles generating the config.yaml, don't do it here return - - hash_new: hashlib.md5 = fetch_config_yaml_md5() - hash_old: str = read_hash(CONFIG_HASH) - - if hash_new.hexdigest() != hash_old: - log.debug("Delta detected. 
Reconfiguring Slurm now.") - cfg_new = fetch_config_yaml() - save_hash(CONFIG_HASH, hash_new.hexdigest()) - save_config(cfg_new, CONFIG_FILE) - cfg_new = load_config_file(CONFIG_FILE) - lkp = Lookup(cfg_new) - util.lkp = lkp - if lkp.is_controller: - conf.gen_controller_configs(lkp) - log.info("Restarting slurmctld to make changes take effect.") - try: - # TODO: consider removing "restart" since "reconfigure" should restart slurmctld as well - run("sudo systemctl restart slurmctld.service", check=False) - util.scontrol_reconfigure(lkp) - except Exception: - log.exception("failed to reconfigure slurmctld") - util.run(f"wall '{update_msg}'", timeout=30) - log.debug("Done.") - elif lkp.instance_role_safe in ["compute", "login"]: - log.info("Restarting slurmd to make changes take effect.") - run("systemctl restart slurmd") - util.run(f"wall '{update_msg}'", timeout=30) - log.debug("Done.") + + upd, cfg_new = util.fetch_config() + if not upd: + log.debug("No changes in config detected.") + return + log.debug("Changes in config detected. Reconfiguring Slurm now.") + util.update_config(cfg_new) + + if lookup().is_controller: + conf.gen_controller_configs(lookup()) + log.info("Restarting slurmctld to make changes take effect.") + try: + # TODO: consider removing "restart" since "reconfigure" should restart slurmctld as well + run("sudo systemctl restart slurmctld.service", check=False) + util.scontrol_reconfigure(lookup()) + except Exception: + log.exception("failed to reconfigure slurmctld") + util.run(f"wall '{update_msg}'", timeout=30) + log.debug("Done.") + elif lookup().instance_role_safe in ["compute", "login"]: + log.info("Restarting slurmd to make changes take effect.") + run("systemctl restart slurmd") + util.run(f"wall '{update_msg}'", timeout=30) + log.debug("Done.") def update_topology(lkp: util.Lookup) -> None: @@ -472,27 +443,113 @@ def update_topology(lkp: util.Lookup) -> None: log.debug("Topology configuration updated. Reconfiguring Slurm.") util.scontrol_reconfigure(lkp) + +def delete_reservation(lkp: util.Lookup, reservation_name: str) -> None: + util.run(f"{lkp.scontrol} delete reservation {reservation_name}") + + +def create_reservation(lkp: util.Lookup, reservation_name: str, node: str, start_time: datetime) -> None: + # Format time to be compatible with slurm reservation. 
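+    # e.g. datetime(2024, 5, 1, 3, 30) -> "2024-05-01T03:30:00", the form scontrol accepts for starttime=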
+ formatted_start_time = start_time.strftime('%Y-%m-%dT%H:%M:%S') + util.run(f"{lkp.scontrol} create reservation user=slurm starttime={formatted_start_time} duration=180 nodes={node} reservationname={reservation_name}") + + +def get_slurm_reservation_maintenance(lkp: util.Lookup) -> Dict[str, datetime]: + res = util.run(f"{lkp.scontrol} show reservation --json") + all_reservations = json.loads(res.stdout) + reservation_map = {} + + for reservation in all_reservations['reservations']: + name = reservation.get('name') + nodes = reservation.get('node_list') + time_epoch = reservation.get('start_time', {}).get('number') + + if name is None or nodes is None or time_epoch is None: + continue + + if reservation.get('node_count') != 1: + continue + + if name != f"{nodes}_maintenance": + continue + + reservation_map[name] = datetime.fromtimestamp(time_epoch) + + return reservation_map + + +def get_upcoming_maintenance(lkp: util.Lookup) -> Dict[str, Tuple[str, datetime]]: + upc_maint_map = {} + + for node, properties in lkp.instances().items(): + if 'upcomingMaintenance' in properties: + start_time = datetime.strptime(properties['upcomingMaintenance']['startTimeWindow']['earliest'], '%Y-%m-%dT%H:%M:%S%z') + upc_maint_map[node + "_maintenance"] = (node, start_time) + + return upc_maint_map + + +def sync_maintenance_reservation(lkp: util.Lookup) -> None: + upc_maint_map = get_upcoming_maintenance(lkp) # map reservation_name -> (node_name, time) + log.debug(f"upcoming-maintenance-vms: {upc_maint_map}") + + curr_reservation_map = get_slurm_reservation_maintenance(lkp) # map reservation_name -> time + log.debug(f"curr-reservation-map: {curr_reservation_map}") + + del_reservation = set(curr_reservation_map.keys() - upc_maint_map.keys()) + create_reservation_map = {} + + for res_name, (node, start_time) in upc_maint_map.items(): + if res_name in curr_reservation_map: + diff = curr_reservation_map[res_name] - start_time + if abs(diff) <= dt.timedelta(seconds=1): + continue + else: + del_reservation.add(res_name) + create_reservation_map[res_name] = (node, start_time) + else: + create_reservation_map[res_name] = (node, start_time) + + log.debug(f"del-reservation: {del_reservation}") + for res_name in del_reservation: + delete_reservation(lkp, res_name) + + log.debug(f"create-reservation-map: {create_reservation_map}") + for res_name, (node, start_time) in create_reservation_map.items(): + create_reservation(lkp, res_name, node, start_time) + + def main(): try: reconfigure_slurm() except Exception: log.exception("failed to reconfigure slurm") - if lkp.is_controller: + if lookup().is_controller: try: sync_slurm() except Exception: log.exception("failed to sync instances") + try: sync_placement_groups() except Exception: log.exception("failed to sync placement groups") + try: - update_topology(lkp) + update_topology(lookup()) except Exception: log.exception("failed to update topology") + ## TODO: Enable reservation for scheduled maintenance. 
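For reference, the reconciliation that `sync_maintenance_reservation` performs can be traced with plain dictionaries; below is a minimal, self-contained sketch of how the delete/create sets are derived (node names and times are made up for illustration, and no Slurm or Compute API calls are involved):

```python
import datetime as dt
from datetime import datetime

upcoming = {  # reservation_name -> (node, start_time), as from get_upcoming_maintenance
    "node-a_maintenance": ("node-a", datetime(2024, 5, 1, 3, 0)),
    "node-b_maintenance": ("node-b", datetime(2024, 5, 2, 3, 0)),
}
current = {  # reservation_name -> start_time, as from get_slurm_reservation_maintenance
    "node-a_maintenance": datetime(2024, 5, 1, 3, 0),  # unchanged -> keep
    "node-c_maintenance": datetime(2024, 5, 3, 3, 0),  # no longer upcoming -> delete
}

to_delete = set(current) - set(upcoming)
to_create = {}
for name, (node, start) in upcoming.items():
    if name in current and abs(current[name] - start) <= dt.timedelta(seconds=1):
        continue  # existing reservation still matches the maintenance window
    if name in current:
        to_delete.add(name)  # start time moved: drop and recreate
    to_create[name] = (node, start)

print(sorted(to_delete))  # ['node-c_maintenance']
print(sorted(to_create))  # ['node-b_maintenance']
```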
+ # try: + # sync_maintenance_reservation(lookup()) + # except Exception: + # log.exception("failed to sync slurm reservation for scheduled maintenance") + try: + # TODO: it performs 1 to 4 GCS list requests, + # use cached version, combine with `_list_config_blobs` install_custom_scripts(check_hash=True) except Exception: log.exception("failed to sync custom scripts") diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py new file mode 100755 index 0000000000..1747742d1e --- /dev/null +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/sort_nodes.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This script sorts nodes based on their `physicalHost`. + +See https://cloud.google.com/compute/docs/instances/use-compact-placement-policies + +You can reduce latency in tightly coupled HPC workloads (including distributed ML training) +by deploying them to machines that are located close together. +For example, if you deploy your workload on a single physical rack, you can expect lower latency +than if your workload is spread across multiple racks. +Sending data across multiple rack requires sending data through additional network switches. + +Example usage: +``` my_sbatch.sh +#SBATCH --ntasks-per-node=8 +#SBATCH --nodes=64 + +export SLURM_HOSTFILE=$(sort_nodes.py) + +srun -l hostname | sort +``` +""" +import os +import subprocess +import uuid +from typing import List, Optional, Dict +from collections import OrderedDict + +def order(paths: List[List[str]]) -> List[str]: + """ + Orders the leaves of the tree in a way that minimizes the sum of distance in between + each pair of neighboring nodes in the resulting order. + The resulting order will always start from the first node in the input list. + The ordering is "stable" with respect to the input order of the leaves i.e. + given a choice between two nodes (identical in other ways) it will select "nodelist-smallest" one. + + Returns a list of nodenames, ordered as described above. + """ + if not paths: return [] + class Vert: + "Represents a vertex in a *network* tree." 
+ def __init__(self, name: str, parent: "Vert"): + self.name = name + self.parent = parent + # Use `OrderedDict` to preserve insertion order + # TODO: once we move to Python 3.7+ use regular `dict` since it has the same guarantee + self.children = OrderedDict() + + # build a tree, children are ordered by insertion order + root = Vert("", None) + for path in paths: + n = root + for v in path: + if v not in n.children: + n.children[v] = Vert(v, n) + n = n.children[v] + + # walk the tree in insertion order, gather leaves + result = [] + def gather_nodes(v: Vert) -> None: + if not v.children: # this is a Slurm node + result.append(v.name) + for u in v.children.values(): + gather_nodes(u) + gather_nodes(root) + return result + + +class Instance: + def __init__(self, name: str, zone: str, physical_host: Optional[str]): + self.name = name + self.zone = zone + self.physical_host = physical_host + + +def make_path(node_name: str, inst: Optional[Instance]) -> List[str]: + if not inst: # node with unknown instance (e.g. hybrid cluster) + return ["unknown", node_name] + zone = f"zone_{inst.zone}" + if not inst.physical_host: # node without physical host info (e.g. no placement policy) + return [zone, "unknown", node_name] + + assert inst.physical_host.startswith("/"), f"Unexpected physicalHost: {inst.physical_host}" + parts = inst.physical_host[1:].split("/") + if len(parts) >= 4: + return [*parts, node_name] + return [zone, *parts, node_name] + + +def to_hostnames(nodelist: str) -> List[str]: + cmd = ["scontrol", "show", "hostnames", nodelist] + out = subprocess.run(cmd, check=True, stdout=subprocess.PIPE).stdout + return [n.decode("utf-8") for n in out.splitlines()] + + +def get_instances(node_names: List[str]) -> Dict[str, object]: + fmt = ( + "--format=csv[no-heading,separator=','](zone,resourceStatus.physicalHost,name)" + ) + cmd = ["gcloud", "compute", "instances", "list", fmt] + + scp = os.path.commonprefix(node_names) + if scp: + cmd.append(f"--filter=name~'{scp}.*'") + out = subprocess.run(cmd, check=True, stdout=subprocess.PIPE).stdout + d = {} + for line in out.splitlines(): + zone, physical_host, name = line.decode("utf-8").split(",") + d[name] = Instance(name, zone, physical_host) + return {n: d.get(n) for n in node_names} + + +def main(args) -> None: + nodelist = args.nodelist or os.getenv("SLURM_NODELIST") + if not nodelist: + raise ValueError("nodelist is not provided and SLURM_NODELIST is not set") + + if args.ntasks_per_node is None: + args.ntasks_per_node = int(os.getenv("SLURM_NTASKS_PER_NODE", "") or 1) + assert args.ntasks_per_node > 0 + + output = args.output or f"hosts.{uuid.uuid4()}" + + node_names = to_hostnames(nodelist) + instannces = get_instances(node_names) + paths = [make_path(n, instannces[n]) for n in node_names] + ordered = order(paths) + + with open(output, "w") as f: + for node in ordered: + for _ in range(args.ntasks_per_node): + f.write(node) + f.write("\n") + print(output) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + "--nodelist", + type=str, + help="Slurm 'hostlist expression' of nodes to sort, if not set the value of SLURM_NODELIST environment variable will be used", + ) + parser.add_argument( + "--ntasks-per-node", + type=int, + help="""Number of times to repeat each node in resulting sorted list. 
+If not set, the value of SLURM_NTASKS_PER_NODE environment variable will be used, +if neither is set, defaults to 1""", + ) + parser.add_argument( + "--output", type=str, help="Output file to write, defaults to 'hosts.'" + ) + args = parser.parse_args() + main(args) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py index 9848e5a995..4866dffb1e 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/suspend.py @@ -29,7 +29,7 @@ separate, execute_with_futures, ) -from util import lkp, TPU +from util import lookup, TPU import slurm_gcp_plugins @@ -49,9 +49,9 @@ def truncate_iter(iterable, max_count): def delete_instance_request(instance): - request = lkp.compute.instances().delete( - project=lkp.project, - zone=lkp.instance(instance).zone, + request = lookup().compute.instances().delete( + project=lookup().project, + zone=lookup().instance(instance).zone, instance=instance, ) log_api_request(request) @@ -74,10 +74,10 @@ def stop_tpu(data): def delete_tpu_instances(instances): stop_data = [] - for prefix, nodes in util.groupby_unsorted(instances, lkp.node_prefix): + for prefix, nodes in util.groupby_unsorted(instances, lookup().node_prefix): log.info(f"Deleting TPU nodes from prefix {prefix}") lnodes = list(nodes) - tpu_nodeset = lkp.node_nodeset(lnodes[0]) + tpu_nodeset = lookup().node_nodeset(lnodes[0]) tpu = TPU(tpu_nodeset) stop_data.extend( [{"tpu": tpu, "node": node, "nodeset": tpu_nodeset} for node in lnodes] @@ -87,7 +87,7 @@ def delete_tpu_instances(instances): def delete_instances(instances): """delete instances individually""" - invalid, valid = separate(lambda inst: bool(lkp.instance(inst)), instances) + invalid, valid = separate(lambda inst: bool(lookup().instance(inst)), instances) if len(invalid) > 0: log.debug("instances do not exist: {}".format(",".join(invalid))) if len(valid) == 0: @@ -109,7 +109,7 @@ def delete_instances(instances): def suspend_nodes(nodes: List[str]) -> None: tpu_nodes, other_nodes = [], [] for node in nodes[:]: - if lkp.node_is_tpu(node): + if lookup().node_is_tpu(node): tpu_nodes.append(node) else: other_nodes.append(node) @@ -124,7 +124,7 @@ def main(nodelist): # Filter out nodes not in config.yaml other_nodes, pm_nodes = separate( - lkp.is_power_managed_node, util.to_hostnames(nodelist) + lookup().is_power_managed_node, util.to_hostnames(nodelist) ) if other_nodes: log.debug( @@ -137,8 +137,8 @@ def main(nodelist): return log.info(f"suspend {nodelist}") - if lkp.cfg.enable_slurm_gcp_plugins: - slurm_gcp_plugins.pre_main_suspend_nodes(lkp=lkp, nodelist=nodelist) + if lookup().cfg.enable_slurm_gcp_plugins: + slurm_gcp_plugins.pre_main_suspend_nodes(lkp=lookup(), nodelist=nodelist) suspend_nodes(pm_nodes) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py index e95e436b0c..8db9add6c3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/common.py @@ -70,6 +70,18 @@ class TstMachineConf: class TstTemplateInfo: 
gpu_count: int = 0 +@dataclass +class TstInstance: + name: str + region: str = "gondor" + zone: str = "anorien" + placementPolicyId: Optional[str] = None + physicalHost: Optional[str] = None + + @property + def resourceStatus(self): + return {"physicalHost": self.physicalHost} + def make_to_hostnames_mock(tbl: Optional[dict[str, list[str]]]): tbl = tbl or {} diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt index 85c80e7f84..fc44e31a6c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/requirements.txt @@ -1,3 +1,4 @@ pytest pytest-mock +pytest_unordered mock diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py index 56a94ba187..0b25b0df58 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_conf.py @@ -100,7 +100,8 @@ def test_dict_to_conf(value: dict, want: str): SuspendRate=0 SuspendTimeout=300 TreeWidth=128 -TopologyPlugin=topology/tree"""), +TopologyPlugin=topology/tree +TopologyParam=SwitchAsNodeRank"""), (TstCfg( install_dir="ukulele", cloud_parameters={ @@ -110,6 +111,7 @@ def test_dict_to_conf(value: dict, want: str): "suspend_rate": None, "suspend_timeout": None, "topology_plugin": None, + "topology_param": None, "tree_width": None, }, ), @@ -121,7 +123,8 @@ def test_dict_to_conf(value: dict, want: str): SuspendRate=0 SuspendTimeout=300 TreeWidth=128 -TopologyPlugin=topology/tree"""), +TopologyPlugin=topology/tree +TopologyParam=SwitchAsNodeRank"""), (TstCfg( install_dir="ukulele", cloud_parameters={ @@ -131,6 +134,7 @@ def test_dict_to_conf(value: dict, want: str): "suspend_rate": 3, "suspend_timeout": 4, "topology_plugin": "guess", + "topology_param": "yellow", "tree_width": 5, }, ), @@ -142,7 +146,8 @@ def test_dict_to_conf(value: dict, want: str): SuspendRate=3 SuspendTimeout=4 TreeWidth=5 -TopologyPlugin=guess"""), +TopologyPlugin=guess +TopologyParam=yellow"""), ]) def test_conflines(cfg, want): assert conf.conflines(util.Lookup(cfg)) == want diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py index 3dc86dcd21..6d44338c81 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_topology.py @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
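The behavior of `sort_nodes.order` (added above) is easiest to see on a tiny input; this mirrors the `test_sort_nodes_order` case that appears further below:

```python
import sort_nodes

# Leaves are grouped under the first-seen path prefix, and groups are emitted
# in the order their prefix first appears in the input list.
paths = [p.split("/") for p in ["y/n-0", "z/n-1", "x/n-2", "x/n-3", "y/n-4", "g/n-10"]]
print(sort_nodes.order(paths))
# ['n-0', 'n-4', 'n-1', 'n-2', 'n-3', 'n-10']
```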
+import pytest +import json import mock -from common import TstCfg, TstNodeset, TstTPU, make_to_hostnames_mock +from pytest_unordered import unordered +from common import TstCfg, TstNodeset, TstTPU, TstInstance +import sort_nodes import util import conf @@ -56,12 +60,33 @@ def tpu_se(ns: TstNodeset) -> TstTPU: tpu_mock.side_effect = tpu_se lkp = util.Lookup(cfg) + lkp.instances = lambda: { n.name: n for n in [ + # nodeset blue + TstInstance("m22-blue-0"), # no physicalHost + TstInstance("m22-blue-0", physicalHost="/a/a/a"), + TstInstance("m22-blue-1", physicalHost="/a/a/b"), + TstInstance("m22-blue-2", physicalHost="/a/b/a"), + TstInstance("m22-blue-3", physicalHost="/b/a/a"), + # nodeset green + TstInstance("m22-green-3", physicalHost="/a/a/c"), + ]} + uncompressed = conf.gen_topology(lkp) - want_uncompressed = [ - "SwitchName=slurm-root Switches=ns_blue,ns_green,ns_pink", - "SwitchName=ns_blue Nodes=m22-blue-[0-6]", - "SwitchName=ns_green Nodes=m22-green-[0-4]", + want_uncompressed = [ + #NOTE: the switch names are not unique, it's not valid content for topology.conf + # The uniquefication and compression of names are done in the compress() method + "SwitchName=slurm-root Switches=a,b,ns_blue,ns_green,ns_pink", + # "physical" topology + 'SwitchName=a Switches=a,b', + 'SwitchName=a Nodes=m22-blue-[0-1],m22-green-3', + 'SwitchName=b Nodes=m22-blue-2', + 'SwitchName=b Switches=a', + 'SwitchName=a Nodes=m22-blue-3', + # topology "by nodeset" + "SwitchName=ns_blue Nodes=m22-blue-[4-6]", + "SwitchName=ns_green Nodes=m22-green-[0-2,4]", "SwitchName=ns_pink Nodes=m22-pink-[0-3]", + # TPU topology "SwitchName=tpu-root Switches=ns_bold,ns_slim", "SwitchName=ns_bold Switches=bold-[0-3]", "SwitchName=bold-0 Nodes=m22-bold-[0-2]", @@ -73,10 +98,18 @@ def tpu_se(ns: TstNodeset) -> TstTPU: compressed = uncompressed.compress() want_compressed = [ - "SwitchName=s0 Switches=s0_[0-2]", - "SwitchName=s0_0 Nodes=m22-blue-[0-6]", - "SwitchName=s0_1 Nodes=m22-green-[0-4]", - "SwitchName=s0_2 Nodes=m22-pink-[0-3]", + "SwitchName=s0 Switches=s0_[0-4]", # root + # "physical" topology + 'SwitchName=s0_0 Switches=s0_0_[0-1]', # /a + 'SwitchName=s0_0_0 Nodes=m22-blue-[0-1],m22-green-3', # /a/a + 'SwitchName=s0_0_1 Nodes=m22-blue-2', # /a/b + 'SwitchName=s0_1 Switches=s0_1_0', # /b + 'SwitchName=s0_1_0 Nodes=m22-blue-3', # /b/a + # topology "by nodeset" + "SwitchName=s0_2 Nodes=m22-blue-[4-6]", + "SwitchName=s0_3 Nodes=m22-green-[0-2,4]", + "SwitchName=s0_4 Nodes=m22-pink-[0-3]", + # TPU topology "SwitchName=s1 Switches=s1_[0-1]", "SwitchName=s1_0 Switches=s1_0_[0-3]", "SwitchName=s1_0_0 Nodes=m22-bold-[0-2]", @@ -86,6 +119,74 @@ def tpu_se(ns: TstNodeset) -> TstTPU: "SwitchName=s1_1 Nodes=m22-slim-[0-2]"] assert list(compressed.render_conf_lines()) == want_compressed - conf.gen_topology_conf(util.Lookup(cfg)) + assert conf.gen_topology_conf(lkp) == True want_written = PRELUDE + "\n".join(want_compressed) + "\n\n" assert open(cfg.output_dir + "/cloud_topology.conf").read() == want_written + + summary_got = json.loads(open(cfg.output_dir + "/cloud_topology.summary.json").read()) + + assert summary_got == { + "down_nodes": unordered( + [f"m22-blue-{i}" for i in (4,5,6)] + + [f"m22-green-{i}" for i in (0,1,2,4)] + + [f"m22-pink-{i}" for i in range(4)]), + "tpu_nodes": unordered( + [f"m22-bold-{i}" for i in range(9)] + + [f"m22-slim-{i}" for i in range(3)]), + 'physical_host': { + 'm22-blue-0': '/a/a/a', + 'm22-blue-1': '/a/a/b', + 'm22-blue-2': '/a/b/a', + 'm22-blue-3': '/b/a/a', + 'm22-green-3': '/a/a/c'}, + } + + + 
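The `/a/a/a`-style `physicalHost` values exercised above are the same kind of strings that `sort_nodes.make_path` (shown earlier) splits into tree paths; a quick illustration with made-up instances:

```python
from sort_nodes import Instance, make_path

inst = Instance("m22-blue-0", zone="us-central1-a", physical_host="/a/a/a")
print(make_path("m22-blue-0", inst))
# ['zone_us-central1-a', 'a', 'a', 'a', 'm22-blue-0']

# Nodes without placement info, or without a known instance, fall back to "unknown".
print(make_path("m22-blue-9", Instance("m22-blue-9", "us-central1-a", None)))
# ['zone_us-central1-a', 'unknown', 'm22-blue-9']
print(make_path("hybrid-node", None))
# ['unknown', 'hybrid-node']
```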
+def test_gen_topology_conf_update(): + cfg = TstCfg( + nodeset={ + "c": TstNodeset("green", node_count_static=2), + }, + output_dir=tempfile.mkdtemp(), + ) + lkp = util.Lookup(cfg) + lkp.instances = lambda: {} # no instances + + # initial generation - reconfigure + assert conf.gen_topology_conf(lkp) == True + + # add node: node_count_static 2 -> 3 - reconfigure + lkp.cfg.nodeset["c"].node_count_static = 3 + assert conf.gen_topology_conf(lkp) == True + + # remove node: node_count_static 3 -> 2 - no reconfigure + lkp.cfg.nodeset["c"].node_count_static = 2 + assert conf.gen_topology_conf(lkp) == False + + # set empty physicalHost - no reconfigure + lkp.instances = lambda: { n.name: n for n in [TstInstance("m22-green-0", physicalHost="")]} + assert conf.gen_topology_conf(lkp) == False + + # set physicalHost - reconfigure + lkp.instances = lambda: { n.name: n for n in [TstInstance("m22-green-0", physicalHost="/a/b/c")]} + assert conf.gen_topology_conf(lkp) == True + + # change physicalHost - reconfigure + lkp.instances = lambda: { n.name: n for n in [TstInstance("m22-green-0", physicalHost="/a/b/z")]} + assert conf.gen_topology_conf(lkp) == True + + # shut down node - no reconfigure + lkp.instances = lambda: {} + assert conf.gen_topology_conf(lkp) == False + + +@pytest.mark.parametrize( + "paths,expected", + [ + (["z/n-0", "z/n-1", "z/n-2", "z/n-3", "z/n-4", "z/n-10"], ['n-0', 'n-1', 'n-2', 'n-3', 'n-4', 'n-10']), + (["y/n-0", "z/n-1", "x/n-2", "x/n-3", "y/n-4", "g/n-10"], ['n-0', 'n-4', 'n-1', 'n-2', 'n-3', 'n-10']), + ]) +def test_sort_nodes_order(paths: list[list[str]], expected: list[str]) -> None: + paths = [l.split("/") for l in paths] + assert sort_nodes.order(paths) == expected diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py index f74804250a..4dd3c8a17b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/tests/test_util.py @@ -58,7 +58,7 @@ ], ) def test_node_desc(name, expected): - assert util.lkp._node_desc(name) == expected + assert util.lookup()._node_desc(name) == expected @pytest.mark.parametrize( @@ -69,7 +69,7 @@ def test_node_desc(name, expected): ) def test_node_desc_fail(name): with pytest.raises(Exception): - util.lkp._node_desc(name) + util.lookup()._node_desc(name) @pytest.mark.parametrize( diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py index b4252a2560..cb17500d90 100755 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/util.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
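The `util.py` rewrite that follows replaces the old single-file `config.yaml` md5 check with one digest folded over the md5 of every config blob in the bucket (see `_list_config_blobs`), so a change to any partition or nodeset file triggers reconfiguration. A standalone sketch of that change-detection idea, not the Toolkit API (the real code sorts blobs by name before hashing):

```python
import hashlib
from typing import Iterable, Optional

def combined_digest(blob_md5s: Iterable[str]) -> str:
    """Fold per-blob md5 strings into a single digest (sorted for stability)."""
    h = hashlib.md5()
    for md5 in sorted(blob_md5s):
        h.update(md5.encode("utf-8"))
    return h.hexdigest()

def config_changed(old_hash: Optional[str], blob_md5s: Iterable[str]) -> bool:
    return combined_digest(blob_md5s) != old_hash

first = combined_digest(["abc==", "def=="])
print(config_changed(None, ["abc==", "def=="]))    # True  (no stored hash yet)
print(config_changed(first, ["abc==", "def=="]))   # False (nothing changed)
print(config_changed(first, ["abc==", "zzz=="]))   # True  (a blob was re-uploaded)
```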
-from typing import Iterable, List, Tuple, Optional +from typing import Iterable, List, Tuple, Optional, Any, Dict import argparse import base64 import collections @@ -38,7 +38,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from contextlib import contextmanager from functools import lru_cache, reduce, wraps -from itertools import chain, compress, islice +from itertools import chain, islice from pathlib import Path from time import sleep, time @@ -78,7 +78,6 @@ else: CONFIG_FILE = Path(__file__).with_name("config.yaml") API_REQ_LIMIT = 2000 -URI_REGEX = r"[a-z]([-a-z0-9]*[a-z0-9])?" def mkdirp(path: Path) -> None: @@ -89,10 +88,6 @@ def mkdirp(path: Path) -> None: p for p in (Path(__file__).parent, Path("/slurm/scripts")) if p.is_dir() ) -# slurm-gcp config object, could be empty if not available -cfg = NSDict() -# caching Lookup object -lkp = None # load all directories as Paths into a dict-like namespace dirs = NSDict( @@ -161,7 +156,7 @@ def universe_domain() -> str: def endpoint_version(api: ApiEndpoint) -> Optional[str]: - return lkp.endpoint_versions.get(api.value, None) + return lookup().endpoint_versions.get(api.value, None) @lru_cache(maxsize=1) @@ -271,17 +266,18 @@ def map_with_futures(func, seq): res = e yield res +def _get_bucket_and_common_prefix() -> Tuple[str, str]: + uri = instance_metadata("attributes/slurm_bucket_path") + return parse_bucket_uri(uri) def blob_get(file): - uri = instance_metadata("attributes/slurm_bucket_path") - bucket_name, path = parse_bucket_uri(uri) + bucket_name, path = _get_bucket_and_common_prefix() blob_name = f"{path}/{file}" return storage_client().get_bucket(bucket_name).blob(blob_name) def blob_list(prefix="", delimiter=None): - uri = instance_metadata("attributes/slurm_bucket_path") - bucket_name, path = parse_bucket_uri(uri) + bucket_name, path = _get_bucket_and_common_prefix() blob_prefix = f"{path}/{prefix}" # Note: The call returns a response only when the iterator is consumed. blobs = storage_client().list_blobs( @@ -304,9 +300,9 @@ def install_custom_scripts(check_hash=False): """download custom scripts from gcs bucket""" compute_tokens = ["compute", "prolog", "epilog"] - if lkp.instance_role == "compute": + if lookup().instance_role == "compute": try: - compute_tokens.append(f"nodeset-{lkp.node_nodeset_name()}") + compute_tokens.append(f"nodeset-{lookup().node_nodeset_name()}") except Exception as e: log.error(f"Failed to lookup nodeset: {e}") @@ -316,7 +312,7 @@ def install_custom_scripts(check_hash=False): "compute": compute_tokens, "controller": ["controller", "prolog", "epilog"], }, - lkp.instance_role, + lookup().instance_role, [], ) prefixes = [f"slurm-{tok}-script" for tok in prefix_tokens] @@ -341,7 +337,7 @@ def install_custom_scripts(check_hash=False): chown_slurm(dirs.custom_scripts / par) need_update = True if check_hash and fullpath.exists(): - # TODO: MD5 reported by gcloud may differ from the one calculated here (e.g. if blob got gzipped), + # TODO: MD5 reported by gcloud may differ from the one calculated here (e.g. 
if blob got gzipped), # consider using gCRC32C need_update = hash_file(fullpath) != blob.md5_hash if need_update: @@ -398,9 +394,14 @@ def storage_client() -> storage.Client: return storage.Client(client_options=ClientOptions(**co)) -def load_config_data(config): - """load dict-like data into a config object""" - cfg = NSDict(config) +class DeffetiveStoredConfigError(Exception): + """ + Raised when config can not be loaded and assembled from bucket + """ + pass + + +def _fill_cfg_defaults(cfg: NSDict) -> NSDict: if not cfg.slurm_log_dir: cfg.slurm_log_dir = dirs.log if not cfg.slurm_bin_dir: @@ -409,8 +410,7 @@ def load_config_data(config): cfg.slurm_control_host = f"{cfg.slurm_cluster_name}-controller" if not cfg.slurm_control_host_port: cfg.slurm_control_host_port = "6820-6830" - if not cfg.munge_mount: - # NOTE: should only happen with cloud controller + if not cfg.munge_mount: # NOTE: should only happen with cloud controller cfg.munge_mount = NSDict( { "server_ip": cfg.slurm_control_addr or cfg.slurm_control_host, @@ -419,21 +419,11 @@ def load_config_data(config): "mount_options": "defaults,hard,intr,_netdev", } ) - - if not cfg.enable_debug_logging and isinstance(cfg.enable_debug_logging, NSDict): - cfg.enable_debug_logging = False - return cfg - - -def new_config(config): - """initialize a new config object - necessary defaults are handled here - """ - cfg = load_config_data(config) - + network_storage_iter = filter( None, ( + cfg.munge_mount, *cfg.network_storage, *cfg.login_network_storage, *chain.from_iterable(ns.network_storage for ns in cfg.nodeset.values()), @@ -448,36 +438,107 @@ def new_config(config): netstore.server_ip = cfg.slurm_control_host return cfg +def _list_config_blobs() -> Tuple[Any, str]: + _, common_prefix = _get_bucket_and_common_prefix() + res = { # TODO: use a dataclass once we move to python 3.7 + "core": None, + "partition": [], + "nodeset": [], + "nodeset_dyn": [], + "nodeset_tpu": [], + } + hash = hashlib.md5() + blobs = list(blob_list(prefix="")) + # sort blobs so hash is consistent + for blob in sorted(blobs, key=lambda b: b.name): + if blob.name == f"{common_prefix}/config.yaml": + res["core"] = blob + hash.update(blob.md5_hash.encode("utf-8")) + for key in ("partition", "nodeset", "nodeset_dyn", "nodeset_tpu"): + if blob.name.startswith(f"{common_prefix}/{key}_configs/"): + res[key].append(blob) + hash.update(blob.md5_hash.encode("utf-8")) + + if res["core"] is None: + raise DeffetiveStoredConfigError("config.yaml not found in bucket") + return res, hash.hexdigest() + + +def _fetch_config(old_hash: Optional[str]) -> Optional[Tuple[NSDict, str]]: + """Fetch config from bucket, returns None if no changes are detected.""" + blobs, hash = _list_config_blobs() + if old_hash == hash: + return None + + def _download(bs) -> List[Any]: + return [yaml.safe_load(b.download_as_text()) for b in bs] + + return _assemble_config( + core=_download([blobs["core"]])[0], + partitions=_download(blobs["partition"]), + nodesets=_download(blobs["nodeset"]), + nodesets_dyn=_download(blobs["nodeset_dyn"]), + nodesets_tpu=_download(blobs["nodeset_tpu"]), + ), hash + +def _assemble_config( + core: Any, + partitions: List[Any], + nodesets: List[Any], + nodesets_dyn: List[Any], + nodesets_tpu: List[Any], + ) -> NSDict: + cfg = NSDict(core) + + # add partition configs + for p_yaml in partitions: + p_cfg = NSDict(p_yaml) + assert p_cfg.get("partition_name"), "partition_name is required" + p_name = p_cfg.partition_name + assert p_name not in cfg.partitions, f"partition {p_name} 
already defined" + cfg.partitions[p_name] = p_cfg + + # add nodeset configs + ns_names = set() + def _add_nodesets(yamls: List[Any], target: dict): + for ns_yaml in yamls: + ns_cfg = NSDict(ns_yaml) + assert ns_cfg.get("nodeset_name"), "nodeset_name is required" + ns_name = ns_cfg.nodeset_name + assert ns_name not in ns_names, f"nodeset {ns_name} already defined" + target[ns_name] = ns_cfg + ns_names.add(ns_name) + + _add_nodesets(nodesets, cfg.nodeset) + _add_nodesets(nodesets_dyn, cfg.nodeset_dyn) + _add_nodesets(nodesets_tpu, cfg.nodeset_tpu) + + # validate that configs for all referenced nodesets are present + for p in cfg.partitions.values(): + for ns_name in chain(p.partition_nodeset, p.partition_nodeset_dyn, p.partition_nodeset_tpu): + if ns_name not in ns_names: + raise DeffetiveStoredConfigError(f"nodeset {ns_name} not defined in config") + + return _fill_cfg_defaults(cfg) + +def fetch_config() -> Tuple[bool, NSDict]: + """ + Fetches config from bucket and saves it locally + Returns True if new (updated) config was fetched + """ + hash_file = Path("/slurm/scripts/.config.hash") + old_hash = hash_file.read_text() if hash_file.exists() else None + + cfg_and_hash = _fetch_config(old_hash=old_hash) + if not cfg_and_hash: + return False, _load_config() -def fetch_config_yaml(): - """Fetch config.yaml from bucket""" - config_yaml = blob_get("config.yaml").download_as_text() - cfg = new_config(yaml.safe_load(config_yaml)) - return cfg - - -def fetch_config_yaml_md5(): - """Fetch config.yaml blob md5 from bucket""" - blob = blob_get("config.yaml") - blob.reload() # Populate blob with metadata - hash_str = str(blob.md5_hash).encode(encoding="utf-8") - return hashlib.md5(hash_str) - - -def load_config_file(path): - """load config from file""" - content = None - try: - content = yaml.safe_load(Path(path).read_text()) - except FileNotFoundError: - log.warning(f"config file not found: {path}") - return NSDict() - return load_config_data(content) - - -def save_config(cfg, path): - """save given config to file at path""" - Path(path).write_text(yaml.dump(cfg, Dumper=Dumper)) + cfg, hash = cfg_and_hash + hash_file.write_text(hash) + chown_slurm(hash_file) + CONFIG_FILE.write_text(yaml.dump(cfg, Dumper=Dumper)) + chown_slurm(CONFIG_FILE) + return True, cfg def owned_file_handler(filename): """create file handler""" @@ -489,7 +550,8 @@ def get_log_path() -> Path: Returns path to log file for the current script. e.g. 
resume.py -> /var/log/slurm/resume.log """ - log_dir = Path(cfg.slurm_log_dir or ".") + cfg_log_dir = lookup().cfg.slurm_log_dir + log_dir = Path(cfg_log_dir) if cfg_log_dir else dirs.log return (log_dir / Path(sys.argv[0]).name).with_suffix(".log") def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: @@ -509,13 +571,11 @@ def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: help="Enable detailed api request output", ) args = parser.parse_args() - loglevel = args.loglevel - if cfg.enable_debug_logging: + if lookup().cfg.enable_debug_logging: loglevel = logging.DEBUG if args.trace_api: - cfg.extra_logging_flags["trace_api"] = True - + lookup().cfg.extra_logging_flags["trace_api"] = True # Configure root logger logging.config.dictConfig({ "version": 1, @@ -555,9 +615,8 @@ def init_log_and_parse(parser: argparse.ArgumentParser) -> argparse.Namespace: def log_api_request(request): """log.trace info about a compute API request""" - if not cfg.extra_logging_flags.get("trace_api"): + if not lookup().cfg.extra_logging_flags.get("trace_api"): return - # output the whole request object as pretty yaml # the body is nested json, so load it as well rep = json.loads(request.to_json()) @@ -834,7 +893,7 @@ def to_hostlist(nodenames) -> str: tmp_file.writelines("\n".join(sorted(nodenames, key=natural_sort))) tmp_file.close() - hostlist = run(f"{lkp.scontrol} show hostlist {tmp_file.name}").stdout.rstrip() + hostlist = run(f"{lookup().scontrol} show hostlist {tmp_file.name}").stdout.rstrip() os.remove(tmp_file.name) return hostlist @@ -899,7 +958,7 @@ def cur_repr(): def part_is_tpu(part): """check if partition with name part contains a nodeset of type tpu""" - return len(lkp.cfg.partitions[part].partition_nodeset_tpu) > 0 + return len(lookup().cfg.partitions[part].partition_nodeset_tpu) > 0 def to_hostnames(nodelist: str) -> List[str]: """make list of hostnames from hostlist expression""" @@ -909,7 +968,7 @@ def to_hostnames(nodelist: str) -> List[str]: hostlist = nodelist else: hostlist = ",".join(nodelist) - hostnames = run(f"{lkp.scontrol} show hostnames {hostlist}").stdout.splitlines() + hostnames = run(f"{lookup().scontrol} show hostnames {hostlist}").stdout.splitlines() return hostnames @@ -974,7 +1033,7 @@ def batch_callback(rid, resp, exc): done[rid] = resp def batch_request(reqs): - batch = lkp.compute.new_batch_http_request(callback=batch_callback) + batch = lookup().compute.new_batch_http_request(callback=batch_callback) for rid, req in reqs: batch.add(req, request_id=rid) return batch @@ -1009,19 +1068,19 @@ def batch_request(reqs): def wait_request(operation, project: str): """makes the appropriate wait request for a given operation""" if "zone" in operation: - req = lkp.compute.zoneOperations().wait( + req = lookup().compute.zoneOperations().wait( project=project, zone=trim_self_link(operation["zone"]), operation=operation["name"], ) elif "region" in operation: - req = lkp.compute.regionOperations().wait( + req = lookup().compute.regionOperations().wait( project=project, region=trim_self_link(operation["region"]), operation=operation["name"], ) else: - req = lkp.compute.globalOperations().wait( + req = lookup().compute.globalOperations().wait( project=project, operation=operation["name"] ) return req @@ -1050,7 +1109,7 @@ def wait_for_operations(operations): def get_filtered_operations(op_filter): """get list of operations associated with group id""" - project = lkp.project + project = lookup().project operations = [] def 
get_aggregated_operations(items): @@ -1061,7 +1120,7 @@ def get_aggregated_operations(items): ) ) - act = lkp.compute.globalOperations() + act = lookup().compute.globalOperations() op = act.aggregatedList( project=project, filter=op_filter, fields="items.*.operations,nextPageToken" ) @@ -1084,55 +1143,39 @@ def get_insert_operations(group_ids): return get_filtered_operations(" AND ".join(f"({f})" for f in filters if f)) -def machine_type_sockets(template): - pattern = re.compile("^(?P[^-]+)") - m = pattern.match(template.machineType) - if not m: - raise Exception(f"template {template} does not match expected regex") - family = m.group("family") - guestCpus: int = int(template.machine_info.guestCpus) - socket_count = dict.get( - { - "h3": 2, - "c2d": 2 if guestCpus > 56 else 1, - "a3": 2, - }, - family, - 1, # assume 1 socket for all other families - ) - return socket_count +def machine_type_family(mt: str) -> str: + """get machine type family from machine type""" + # TODO: doesn't work with N1 custom machine types + # See https://cloud.google.com/compute/docs/instances/creating-instance-with-custom-machine-type#create + return mt.split("-")[0] -def isSmt(template): - machineType: str = template.machineType +def machine_type_sockets(template) -> int: guestCpus: int = int(template.machine_info.guestCpus) + return { + "h3": 2, + "c2d": 2 if guestCpus > 56 else 1, + "a3": 2, + }.get( + machine_type_family(template.machineType), + 1, # assume 1 socket for all other families + ) - pattern = re.compile("^(?P[^-]+)") - matches = pattern.match(machineType) - machineTypeFamily: str = matches["family"] +def isSmt(template) -> bool: # https://cloud.google.com/compute/docs/cpu-platforms - noSmtFamily = [ - "t2a", - "t2d", - "h3", - ] - if machineTypeFamily in noSmtFamily: + noSmtFamily = ("t2a", "t2d", "h3",) + if machine_type_family(template.machineType) in noSmtFamily: return False - elif guestCpus == 1: + if template.machine_info.guestCpus == 1: return False return True -def getThreadsPerCore(template): - threadsPerCore: int = template.advancedMachineFeatures.threadsPerCore - +def getThreadsPerCore(template) -> int: if not isSmt(template): return 1 - elif threadsPerCore: - return threadsPerCore - else: - return 2 + return template.advancedMachineFeatures.threadsPerCore or 2 @retry( @@ -1184,7 +1227,7 @@ def __init__(self, nodeset): if not can_tpu: raise Exception("TPU pip package not installed") self._nodeset = nodeset - self._parent = f"projects/{lkp.project}/locations/{nodeset.zone}" + self._parent = f"projects/{lookup().project}/locations/{nodeset.zone}" co = create_client_options(ApiEndpoint.TPU) self._client = tpu.TpuClient(client_options=co) self.data_disks = [] @@ -1312,7 +1355,7 @@ def get_node(self, nodename): def _register_node(self, nodename, ip_addr): dns_name = socket.getnameinfo((ip_addr, 0), 0)[0] run( - f"{lkp.scontrol} update nodename={nodename} nodeaddr={ip_addr} nodehostname={dns_name}" + f"{lookup().scontrol} update nodename={nodename} nodeaddr={ip_addr} nodehostname={dns_name}" ) def create_node(self, nodename): @@ -1337,7 +1380,7 @@ def create_node(self, nodename): echo "startup script not found > /var/log/startup_error.log" """ with open( - Path(cfg.slurm_scripts_dir or dirs.scripts) / "startup.sh", "r" + Path(lookup().cfg.slurm_scripts_dir or dirs.scripts) / "startup.sh", "r" ) as script: startup_script = script.read() if isinstance(nodename, list): @@ -1354,12 +1397,12 @@ def create_node(self, nodename): "slurm_docker_image": self.nodeset.docker_image, "startup-script": 
startup_script, "slurm_instance_role": "compute", - "slurm_cluster_name": lkp.cfg.slurm_cluster_name, - "slurm_bucket_path": lkp.cfg.bucket_path, + "slurm_cluster_name": lookup().cfg.slurm_cluster_name, + "slurm_bucket_path": lookup().cfg.bucket_path, "slurm_names": ";".join(slurm_names), "universe_domain": universe_domain(), } - node.tags = [lkp.cfg.slurm_cluster_name] + node.tags = [lookup().cfg.slurm_cluster_name] if self.nodeset.service_account: node.service_account.email = self.nodeset.service_account.email node.service_account.scope = self.nodeset.service_account.scopes @@ -1399,7 +1442,7 @@ def delete_node(self, nodename): # not been found, and if it ends in 0, it means that is the master node and it should have been found, and in consequence # log an error nodehostname = yaml.safe_load( - run(f"{lkp.scontrol} --yaml show node {nodename}").stdout.rstrip() + run(f"{lookup().scontrol} --yaml show node {nodename}").stdout.rstrip() )["nodes"][0]["hostname"] if nodehostname.split("-")[-1] == "0": log.error(f"TPU master node {nodename} not found") @@ -1412,8 +1455,8 @@ def delete_node(self, nodename): class Lookup: """Wrapper class for cached data access""" - def __init__(self, cfg=None): - self._cfg = cfg or NSDict() + def __init__(self, cfg): + self._cfg = cfg self.template_cache_path = Path(__file__).parent / "template_info.cache" @property @@ -1446,7 +1489,7 @@ def endpoint_versions(self): @property def scontrol(self): - return Path(self.cfg.slurm_bin_dir if cfg else "") / "scontrol" + return Path(self.cfg.slurm_bin_dir or "") / "scontrol" @cached_property def instance_role(self): @@ -1612,9 +1655,7 @@ def slurm_node(self, nodename): return self.slurm_nodes().get(nodename) @lru_cache(maxsize=1) - def instances(self, project=None, slurm_cluster_name=None): - slurm_cluster_name = slurm_cluster_name or self.cfg.slurm_cluster_name - project = project or self.project + def instances(self) -> Dict[str, object]: instance_information_fields = [ "advancedMachineFeatures", "cpuPlatform", @@ -1649,19 +1690,25 @@ def instances(self, project=None, slurm_cluster_name=None): # "deletionProtection", # "startRestricted", ] - if lkp.cfg.enable_slurm_gcp_plugins: + if lookup().cfg.enable_slurm_gcp_plugins: slurm_gcp_plugins.register_instance_information_fields( - lkp=lkp, - project=project, - slurm_cluster_name=slurm_cluster_name, + lkp=lookup(), + project=self.project, + slurm_cluster_name=self.cfg.slurm_cluster_name, instance_information_fields=instance_information_fields, ) + + # TODO: Merge this with all fields when upcoming maintenance is + # supported in beta. 
+ if endpoint_version(ApiEndpoint.COMPUTE) == 'alpha': + instance_information_fields.append("upcomingMaintenance") + instance_information_fields = sorted(set(instance_information_fields)) instance_fields = ",".join(instance_information_fields) fields = f"items.zones.instances({instance_fields}),nextPageToken" - flt = f"labels.slurm_cluster_name={slurm_cluster_name} AND name:{slurm_cluster_name}-*" + flt = f"labels.slurm_cluster_name={self.cfg.slurm_cluster_name} AND name:{self.cfg.slurm_cluster_name}-*" act = self.compute.instances() - op = act.aggregatedList(project=project, fields=fields, filter=flt) + op = act.aggregatedList(project=self.project, fields=fields, filter=flt) def properties(inst): """change instance properties to a preferred format""" @@ -1683,7 +1730,7 @@ def properties(inst): instance_iter = ( (inst["name"], properties(inst)) for inst in chain.from_iterable( - m["instances"] for m in result.get("items", {}).values() + zone.get("instances", []) for zone in result.get("items", {}).values() ) ) instances.update( @@ -1692,11 +1739,8 @@ def properties(inst): op = act.aggregatedList_next(op, result) return instances - def instance(self, instance_name, project=None, slurm_cluster_name=None): - instances = self.instances( - project=project, slurm_cluster_name=slurm_cluster_name - ) - return instances.get(instance_name) + def instance(self, instance_name: str) -> Optional[object]: + return self.instances().get(instance_name) @lru_cache() def reservation(self, name: str, zone: str) -> object: @@ -1715,20 +1759,17 @@ def reservation(self, name: str, zone: str) -> object: ) @lru_cache(maxsize=1) - def machine_types(self, project=None): - project = project or self.project + def machine_types(self): field_names = "name,zone,guestCpus,memoryMb,accelerators" fields = f"items.zones.machineTypes({field_names}),nextPageToken" machines = defaultdict(dict) act = self.compute.machineTypes() - op = act.aggregatedList(project=project, fields=fields) + op = act.aggregatedList(project=self.project, fields=fields) while op is not None: result = ensure_execute(op) machine_iter = chain.from_iterable( - m["machineTypes"] - for m in result["items"].values() - if "machineTypes" in m + scope.get("machineTypes", []) for scope in result["items"].values() ) for machine in machine_iter: name = machine["name"] @@ -1738,20 +1779,13 @@ def machine_types(self, project=None): op = act.aggregatedList_next(op, result) return machines - def machine_type(self, machine_type, project=None, zone=None): + def machine_type(self, machine_type: str): """ """ custom_patt = re.compile( r"((?P\w+)-)?custom-(?P\d+)-(?P\d+)" ) custom_match = custom_patt.match(machine_type) - if zone: - project = project or self.project - machine_info = ensure_execute( - self.compute.machineTypes().get( - project=project, zone=zone, machineType=machine_type - ) - ) - elif custom_match is not None: + if custom_match is not None: groups = custom_match.groupdict() cpus, mem = (groups[k] for k in ["cpus", "mem"]) machine_info = { @@ -1759,18 +1793,20 @@ def machine_type(self, machine_type, project=None, zone=None): "memoryMb": int(mem), } else: - machines = self.machine_types(project=project) - machine_info = next(iter(machines[machine_type].values()), None) - if machine_info is None: + machines = self.machine_types() + if machine_type not in machines: raise Exception(f"machine type {machine_type} not found") + per_zone = machines[machine_type] + assert per_zone + machine_info = next(iter(per_zone.values())) # pick the first/any zone return 
NSDict(machine_info) - def template_machine_conf(self, template_link, project=None, zone=None): + def template_machine_conf(self, template_link): template = self.template_info(template_link) if not template.machineType: temp_name = trim_self_link(template_link) raise Exception(f"instance template {temp_name} has no machine type") - template.machine_info = self.machine_type(template.machineType, zone=zone) + template.machine_info = self.machine_type(template.machineType) machine = template.machine_info machine_conf = NSDict() @@ -1816,8 +1852,7 @@ def template_cache(self, writeback=False): cache.close() @lru_cache(maxsize=None) - def template_info(self, template_link, project=None): - project = project or self.project + def template_info(self, template_link): template_name = trim_self_link(template_link) # split read and write access to minimize write-lock. This might be a # bit slower? TODO measure @@ -1828,7 +1863,7 @@ def template_info(self, template_link, project=None): template = ensure_execute( self.compute.instanceTemplates().get( - project=project, instanceTemplate=template_name + project=self.project, instanceTemplate=template_name ) ).get("properties") template = NSDict(template) @@ -1839,7 +1874,7 @@ def template_info(self, template_link, project=None): # del template.metadata # translate gpus into an easier-to-read format - machine_info = self.machine_type(template.machineType, project=project) + machine_info = self.machine_type(template.machineType) if machine_info.accelerators: template.gpu_type = machine_info.accelerators[0].guestAcceleratorType template.gpu_count = machine_info.accelerators[0].guestAcceleratorCount @@ -1869,19 +1904,26 @@ def nodeset_map(self, hostnames: list): def etc_dir(self) -> Path: return Path(self.cfg.output_dir or slurmdirs.etc) +_lkp: Optional[Lookup] = None + +def _load_config() -> NSDict: + return NSDict(yaml.safe_load(CONFIG_FILE.read_text())) + +def lookup() -> Lookup: + global _lkp + if _lkp is None: + try: + cfg = _load_config() + except FileNotFoundError: + log.error(f"config file not found: {CONFIG_FILE}") + cfg = NSDict() # TODO: fail here, once all code paths are covered (mainly init_logging) + _lkp = Lookup(cfg) + return _lkp + +def update_config(cfg: NSDict) -> None: + global _lkp + _lkp = Lookup(cfg) + def scontrol_reconfigure(lkp: Lookup) -> None: log.info("Running scontrol reconfigure") run(f"{lkp.scontrol} reconfigure", timeout=30) - -# Define late globals -lkp = Lookup() -cfg = load_config_file(CONFIG_FILE) -if not cfg: - try: - cfg = fetch_config_yaml() - except Exception as e: - log.warning(f"config not found in bucket: {e}") - if cfg: - save_config(cfg, CONFIG_FILE) - -lkp = Lookup(cfg) diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf index bc0a57a486..2c01b6b579 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/variables.tf @@ -272,11 +272,11 @@ EOD variable "login_network_storage" { description = < " - exit 1 -fi - -if [[ -n "${gcloud_dir}" ]]; then - export PATH="$gcloud_dir:$PATH" -fi - -API_ENDPOINT="CLOUDSDK_API_ENDPOINT_OVERRIDES_COMPUTE=https://www.${universe_domain}/compute/${compute_endpoint_version}/" - -if ! 
type -P gcloud 1>/dev/null; then - echo "gcloud is not available and your compute resources are not being cleaned up" - echo "https://console.cloud.google.com/compute/instances?project=${project}" - exit 1 -fi - -echo "Deleting compute nodes" -node_filter="labels.slurm_cluster_name=${cluster_name} AND labels.slurm_instance_role=compute" -while true; do - nodes=$(bash -c "$API_ENDPOINT gcloud compute instances list --project \"${project}\" --format=\"value(selfLink)\" --filter=\"${node_filter}\" --limit=10 | paste -sd \" \" -") - if [[ -z "${nodes}" ]]; then - break - fi - # The lack of quotes is intentional and causes each new space-separated "word" to - # be treated as independent arguments. See PR#2523 - # shellcheck disable=SC2086 - bash -c "$API_ENDPOINT gcloud compute instances delete --quiet ${nodes}" -done - -echo "Deleting resource policies" -policies_filter="name:${cluster_name}-*" -gcloud compute resource-policies list --project "${project}" --format="value(selfLink)" --filter="${policies_filter}" | while read -r line; do - echo "Deleting resource policy: $line" - gcloud compute resource-policies delete --quiet "${line}" || { - echo "Failed to delete resource policy: $line" - } -done diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf index ecdb4b22c3..26dfd21d49 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/variables.tf @@ -382,7 +382,7 @@ Enables automatic cleanup of compute nodes and resource policies (e.g. placement groups) managed by this module, when cluster is destroyed. *WARNING*: Toggling this off will impact the running workload. -Deployed compute nodes and controller will be destroyed. +Deployed compute nodes will be destroyed. EOD type = bool default = true @@ -407,6 +407,7 @@ variable "cloud_parameters" { suspend_rate = optional(number) suspend_timeout = optional(number) topology_plugin = optional(string) + topology_param = optional(string) tree_width = optional(number) }) default = {} @@ -588,12 +589,17 @@ Use this database instead of the one on the controller. user : The user to access the database as. password : The password, given the user, to access the given database. (sensitive) db_name : The database to access. 
+ user_managed_replication : The list of location and (optional) kms_key_name for secret EOD type = object({ server_ip = string user = string password = string # sensitive db_name = string + user_managed_replication = optional(list(object({ + location = string + kms_key_name = optional(string) + })), []) }) default = null sensitive = true diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf index 7ab1c46f14..f9ad93d88b 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/versions.tf @@ -22,14 +22,8 @@ terraform { source = "hashicorp/google" version = ">= 4.84" } - - null = { - source = "hashicorp/null" - version = ">= 3.0" - } - } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-controller/v1.39.0" } } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf index 59e73842cf..4b18af8439 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-login/versions.tf @@ -24,6 +24,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v6-login/v1.39.0" } } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index be6f5e82c9..5f7ffc9614 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.39.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index e4e02e4151..678429f568 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.39.0" } required_version = ">= 0.14.0" diff --git a/docs/blueprint-validation.md 
b/docs/blueprint-validation.md index c6f02032ac..20c4fef115 100644 --- a/docs/blueprint-validation.md +++ b/docs/blueprint-validation.md @@ -112,7 +112,7 @@ validators: * Use `skip-validators` CLI flag: ```shell -./ghpc create ... --skip-validators="test_project_exists,test_apis_enabled" +./gcluster create ... --skip-validators="test_project_exists,test_apis_enabled" ``` * To disable all validators, set the [validation level to IGNORE](#validation-levels). @@ -134,12 +134,12 @@ They can also be set to 3 differing levels of behavior using the command-line For example, this command will set all validators to `WARNING` behavior: ```shell -./ghpc create --validation-level WARNING examples/hpc-slurm.yaml +./gcluster create --validation-level WARNING examples/hpc-slurm.yaml ``` The flag can be shortened to `-l` as shown below using `IGNORE` to disable all validators. ```shell -./ghpc create -l IGNORE examples/hpc-slurm.yaml +./gcluster create -l IGNORE examples/hpc-slurm.yaml ``` diff --git a/docs/hpc-slurm6-tpu-maxtext.md b/docs/hpc-slurm6-tpu-maxtext.md index c13ca2eab6..4126ac4b19 100644 --- a/docs/hpc-slurm6-tpu-maxtext.md +++ b/docs/hpc-slurm6-tpu-maxtext.md @@ -15,8 +15,8 @@ the dataset in your GCS bucket. After that you can update the blueprint to use t dataset from GCS bucket in training script. ```bash -./ghpc create community/examples/hpc-slurm6-tpu-maxtext.yaml --vars project_id=; -./ghpc deploy slurm6-tpu-v4 --auto-approve +./gcluster create community/examples/hpc-slurm6-tpu-maxtext.yaml --vars project_id=; +./gcluster deploy slurm6-tpu-v4 --auto-approve ``` This would deploy slurm cluster with TPU partition, dynamic compute partition. Maxtext benchmark test script @@ -79,7 +79,7 @@ For this we need to return to our cloud shell terminal. Run exit in the terminal Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc destroy slurm6-tpu-v4 --auto-approve +./gcluster destroy slurm6-tpu-v4 --auto-approve ``` When complete you should see something like: diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index 338ff814f9..fbc851e1db 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -140,14 +140,14 @@ command to install the pip packages outlined in pip install -r docs/hybrid-slurm-cluster/requirements.txt ``` -#### Build ghpc +#### Build gcluster -Before you begin, ensure that you have built the `ghpc` tool in the Cluster Toolkit. +Before you begin, ensure that you have built the `gcluster` tool in the Cluster Toolkit. For more information see the [README.md](../../README.md#quickstart) Quickstart. -The commands in these instructions assume the ghpc binary is installed in a +The commands in these instructions assume the gcluster binary is installed in a directory represented in the PATH environment variable. 
To ensure this is the -case, run `make install` after building `ghpc`: +case, run `make install` after building `gcluster`: ```shell make @@ -166,10 +166,10 @@ blueprint will do the following: * Create a subnetwork of `compute-vpc-network` named `primary-subnet` with an internal IP range of 10.1.0.0/16 -Create a deployment directory for the networks using `ghpc`: +Create a deployment directory for the networks using `gcluster`: ```shell -ghpc create docs/hybrid-slurm-cluster/blueprints/create-networks.yaml --vars project_id="<>",project_id_compute="<>" +gcluster create docs/hybrid-slurm-cluster/blueprints/create-networks.yaml --vars project_id="<>",project_id_compute="<>" ``` If successful, this command will provide 3 terraform operations that can be @@ -299,7 +299,7 @@ First, use the Cluster Toolkit to create the deployment directory, replacing "<>" with the ID of your project A: ```shell -ghpc create docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml --vars project_id="<>" +gcluster create docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml --vars project_id="<>" ``` If successful, this command will provide 3 terraform operations that can be diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index 6d29b916a7..b03f5403a1 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -77,7 +77,7 @@ command line, run the following command with the updated values for `<>` and `<>`: ```shell -./ghpc create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml \ +./gcluster create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml \ --vars project_id=<> \ --vars static_controller_hostname=<> \ --vars static_controller_addr=<> @@ -87,10 +87,10 @@ If successful, this command will create a deployment folder. Use the following command to deploy the hybrid configuration: ```sh -./ghpc deploy hybrid-config +./gcluster deploy hybrid-config ``` -`ghpc` reports the changes that Terraform is proposing to make for your +`gcluster` reports the changes that Terraform is proposing to make for your cluster. Optionally, you may review them by typing `d` and pressing `enter`. To deploy the cluster, accept the proposed changes by typing `a` and pressing `enter`. diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 3a4c3370b3..30e721dad0 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -142,14 +142,14 @@ a service account, see the [Setup Authentication](#setup-authentication) section and the Google Cloud documentation on [Service Accounts](https://cloud.google.com/iam/docs/service-accounts). -#### Build ghpc +#### Build gcluster -Before you begin, ensure that you have built the `ghpc` tool in the Cluster Toolkit. +Before you begin, ensure that you have built the `gcluster` tool in the Cluster Toolkit. For more information see the [README.md](../../README.md#quickstart) Quickstart. -The commands in these instructions assume the ghpc binary is installed in a +The commands in these instructions assume the gcluster binary is installed in a directory represented in the PATH environment variable. 
To ensure this is the -case, run `make install` after building `ghpc`: +case, run `make install` after building `gcluster`: ```shell make diff --git a/docs/slurm-dws-flex.md b/docs/slurm-dws-flex.md new file mode 100644 index 0000000000..ffea0bec16 --- /dev/null +++ b/docs/slurm-dws-flex.md @@ -0,0 +1,32 @@ +# Obtaining SlurmGCP nodes with DWS Flex + +[Dynamic Workload Scheduler](https://cloud.google.com/blog/products/compute/introducing-dynamic-workload-scheduler) Flex Start mode is designed for fine-tuning models, experimentation, shorter training jobs, distillation, offline inference, and batch jobs. + +With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity request for your AI/ML jobs by indicating how many you need, a duration, and your preferred region. It supports capacity requests for up to seven days, with no minimum duration requirement. You can request capacity for as little as a few minutes or hours; typically, the scheduler can fulfill shorter requests more quickly than longer ones. + +> [!IMPORTANT] +> The project needs to be allowlisted for private preview access. +> Fill out the [form](https://docs.google.com/forms/d/1etaaXMW9jJUTTxfUC7TIIMttLWT5H-3Q8_3-sG6vwKk/edit). + +In order to make use of DWS Flex Start mode with SlurmGCP, you must specify a proper set of `instance_properties` in the `schedmd-slurm-gcp-v6-nodeset` module. See the example below: + +```yaml + - id: flex_nodeset + source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset + use: [network] + settings: + instance_properties: + reservationAffinity: + consumeReservationType: NO_RESERVATION + scheduling: + maxRunDuration: { seconds: $(2 * 60 * 60) } # 2 hours + onHostMaintenance: TERMINATE + instanceTerminationAction: DELETE + # the rest of the settings, e.g. node_count_static, machine_type, additional_disks, etc. +``` + +**All** fields in `instance_properties` should match provided values, except for `maxRunDuration`, which should be set to the desired duration in seconds (up to 604800 = 7 days). + +> [!WARNING] +> The use of the `instance_properties` setting directly overrides bulkInsert API parameters. While the documented sample +> was tested at the time of publication, it is not regression tested and may cease to work based on changes in the bulkInsert API. diff --git a/docs/tutorials/gromacs/spack-gromacs.md b/docs/tutorials/gromacs/spack-gromacs.md index eae4e7b5d8..fc4797db7a 100644 --- a/docs/tutorials/gromacs/spack-gromacs.md +++ b/docs/tutorials/gromacs/spack-gromacs.md @@ -59,11 +59,11 @@ To build Cluster Toolkit binary from source run: make ``` -You should now have a binary named ghpc in the current directory. To verify the +You should now have a binary named gcluster in the current directory. To verify the build run: ```bash -./ghpc --version +./gcluster --version ``` This should show you the version of the Cluster Toolkit you are using. @@ -91,11 +91,11 @@ This file describes the cluster you will deploy. It defines: [This diagram](https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/docs/tutorials#blueprint-diagram-for-application-tutorials) shows how the different modules relate to each other. 
-After you have inspected the file, use the ghpc binary to create a deployment +After you have inspected the file, use the gcluster binary to create a deployment folder by running: ```bash -./ghpc create docs/tutorials/gromacs/spack-gromacs.yaml --vars project_id= +./gcluster create docs/tutorials/gromacs/spack-gromacs.yaml --vars project_id= ``` > **_NOTE:_** The `--vars` argument is used to override `project_id` in the @@ -109,7 +109,7 @@ contains the terraform needed to deploy your cluster. Use below command to deploy your cluster. ```bash -./ghpc deploy spack-gromacs +./gcluster deploy spack-gromacs ``` After the deployment is finished, you should see below message. @@ -271,7 +271,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc destroy spack-gromacs +./gcluster destroy spack-gromacs ``` When complete you should see something like: diff --git a/docs/tutorials/htcondor.md b/docs/tutorials/htcondor.md index 6399d25766..6c9657b69a 100644 --- a/docs/tutorials/htcondor.md +++ b/docs/tutorials/htcondor.md @@ -34,16 +34,16 @@ To build Cluster Toolkit binary from source run: make ``` -You should now have a binary named ghpc in the current directory. To verify the +You should now have a binary named gcluster in the current directory. To verify the build run: ```bash -./ghpc --version +./gcluster --version ``` This should show you the version of the Cluster Toolkit you are using. -(Optional) To install the `ghpc` binary in your home directory under bin, +(Optional) To install the `gcluster` binary in your home directory under bin, run the following command: ```bash @@ -70,10 +70,10 @@ The blueprint `community/examples/htc-htcondor.yaml` should be open in the Cloud Shell Editor (on the left). This file describes the cluster you will deploy. After you have inspected the -file, use the ghpc binary to create a deployment directory by running: +file, use the gcluster binary to create a deployment directory by running: ```bash -./ghpc create community/examples/htc-htcondor.yaml --vars "project_id=" +./gcluster create community/examples/htc-htcondor.yaml --vars "project_id=" ``` > **_NOTE:_** The `--vars` argument is used to override `project_id` in the @@ -89,7 +89,7 @@ contains the terraform needed to deploy your cluster. Use the following commands to run terraform and deploy your cluster. ```bash -./ghpc deploy htcondor-pool --auto-approve +./gcluster deploy htcondor-pool --auto-approve ``` The Toolkit will automatically approve provisioning a network, building a VM @@ -222,7 +222,7 @@ You should be returned to the Cloud Shell console. You may then destroy your HTCondor pool: ```bash -./ghpc destroy htcondor-pool --auto-approve +./gcluster destroy htcondor-pool --auto-approve ``` When complete you should see output similar to: diff --git a/docs/tutorials/openfoam/spack-openfoam.md b/docs/tutorials/openfoam/spack-openfoam.md index d872e1e436..d9bf3ea6d5 100644 --- a/docs/tutorials/openfoam/spack-openfoam.md +++ b/docs/tutorials/openfoam/spack-openfoam.md @@ -31,7 +31,7 @@ Once you have selected a project, click START. ## Enable APIs & Permissions In a new Google Cloud project there are several apis that must be enabled to -deploy your HPC cluster. These will be caught when you perform `./ghpc create` +deploy your HPC cluster. 
These will be caught when you perform `./gcluster create` but you can save time by enabling them now by running: @@ -59,11 +59,11 @@ To build Cluster Toolkit binary from source run: make ``` -You should now have a binary named ghpc in the current directory. To verify the +You should now have a binary named gcluster in the current directory. To verify the build run: ```bash -./ghpc --version +./gcluster --version ``` This should show you the version of the Cluster Toolkit you are using. @@ -88,11 +88,11 @@ This file describes the cluster you will deploy. It defines: * a Slurm controller * An auto-scaling Slurm partition -After you have inspected the file, use the ghpc binary to create a deployment +After you have inspected the file, use the gcluster binary to create a deployment folder by running: ```bash -./ghpc create docs/tutorials/openfoam/spack-openfoam.yaml --vars project_id= +./gcluster create docs/tutorials/openfoam/spack-openfoam.yaml --vars project_id= ``` > **_NOTE:_** The `--vars` argument is used to override `project_id` in the @@ -106,7 +106,7 @@ contains the terraform needed to deploy your cluster. Use below command to deploy your cluster. ```bash -./ghpc deploy spack-openfoam +./gcluster deploy spack-openfoam ``` You can also use below command to generate a _plan_ that describes the Google @@ -272,7 +272,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc destroy spack-openfoam +./gcluster destroy spack-openfoam ``` When complete you should see something like: diff --git a/docs/tutorials/wrfv3/spack-wrfv3.md b/docs/tutorials/wrfv3/spack-wrfv3.md index cf8dcdb334..2b03ffe66b 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.md +++ b/docs/tutorials/wrfv3/spack-wrfv3.md @@ -59,11 +59,11 @@ To build Cluster Toolkit binary from source run: make ``` -You should now have a binary named ghpc in the current directory. To verify the +You should now have a binary named gcluster in the current directory. To verify the build run: ```bash -./ghpc --version +./gcluster --version ``` This should show you the version of the Cluster Toolkit you are using. @@ -91,11 +91,11 @@ This file describes the cluster you will deploy. It defines: [This diagram](../README.md#blueprint-diagram-for-application-tutorials) shows how the different modules relate to each other. -After you have inspected the file, use the ghpc binary to create a deployment +After you have inspected the file, use the gcluster binary to create a deployment folder by running: ```bash -./ghpc create docs/tutorials/wrfv3/spack-wrfv3.yaml --vars project_id= +./gcluster create docs/tutorials/wrfv3/spack-wrfv3.yaml --vars project_id= ``` > **_NOTE:_** The `--vars` argument is used to override `project_id` in the @@ -109,7 +109,7 @@ contains the terraform needed to deploy your cluster. Use below command to deploy your cluster. ```bash -./ghpc deploy spack-wrfv3 +./gcluster deploy spack-wrfv3 ``` You can also use below command to generate a plan that describes the Google Cloud resources that will be deployed. 
@@ -274,7 +274,7 @@ exit Run the following command in the cloud shell terminal to destroy the cluster: ```bash -./ghpc destroy spack-wrfv3 +./gcluster destroy spack-wrfv3 ``` When complete you should see something like: diff --git a/examples/README.md b/examples/README.md index b10c2c7b74..e0e9ed9b94 100644 --- a/examples/README.md +++ b/examples/README.md @@ -52,6 +52,8 @@ md_toc github examples/README.md | sed -e "s/\s-\s/ * /" * [hpc-gke.yaml](#hpc-gkeyaml--) ![community-badge] ![experimental-badge] * [ml-gke](#ml-gkeyaml--) ![community-badge] ![experimental-badge] * [storage-gke](#storage-gkeyaml--) ![community-badge] ![experimental-badge] + * [gke-a3-megagpu](#gke-a3-megagpuyaml--) ![community-badge] ![experimental-badge] + * [gke-a3-highgpu](#gke-a3-highgpuyaml--) ![community-badge] ![experimental-badge] * [htc-slurm-v5-legacy.yaml](#htc-slurm-v5-legacyyaml--) ![community-badge] ![experimental-badge] * [htc-slurm.yaml](#htc-slurmyaml-) ![community-badge] * [htc-htcondor.yaml](#htc-htcondoryaml--) ![community-badge] ![experimental-badge] @@ -124,7 +126,7 @@ You can set the configuration using the CLI in the `create` and `expand` subcommands as well: ```shell -./ghpc create examples/hpc-slurm.yaml \ +./gcluster create examples/hpc-slurm.yaml \ --vars "project_id=${GOOGLE_CLOUD_PROJECT}" \ --backend-config "bucket=${GCS_BUCKET}" ``` @@ -166,7 +168,7 @@ as follows: * Robust reconfiguration - Reconfiguration is now managed by a service that runs on each instance. This has removed the dependency on the Pub/Sub Google cloud service, and provides a more consistent reconfiguration experience (when calling `ghpc deploy blueprint.yaml -w`). Reconfiguration has also been enabled by default. + Reconfiguration is now managed by a service that runs on each instance. This has removed the dependency on the Pub/Sub Google cloud service, and provides a more consistent reconfiguration experience (when calling `gcluster deploy blueprint.yaml -w`). Reconfiguration has also been enabled by default. * Faster deployments @@ -178,7 +180,7 @@ as follows: * Fewer dependencies in the deployment environment - Reconfiguration and compute node cleanup no longer require users to install local python dependencies in the deployment environment (where ghpc is called). This has allowed for these features to be enabled by default. + Reconfiguration and compute node cleanup no longer require users to install local python dependencies in the deployment environment (where gcluster is called). This has allowed for these features to be enabled by default. * Flexible node to partition relation @@ -565,8 +567,8 @@ VM. The cluster has 2 partitions: To provision the cluster, please run: ```text -./ghpc create examples/ml-slurm-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./ghpc deploy ml-example +./gcluster create examples/ml-slurm-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./gcluster deploy ml-example ``` After accessing the login node, you can activate the conda environment for each @@ -590,7 +592,7 @@ sbatch -N 1 torch_test.sh When you are done, clean up the resources in reverse order of creation: ```text -./ghpc destroy ml-example +./gcluster destroy ml-example ``` Finally, browse to the [Cloud Console][console-images] to delete your custom @@ -614,8 +616,8 @@ VM. 
The cluster has 2 partitions: To provision the cluster, please run: ```text -./ghpc create examples/ml-slurm.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./ghpc deploy ml-example-v6 +./gcluster create examples/ml-slurm.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./gcluster deploy ml-example-v6 ``` After accessing the login node, you can activate the conda environment for each @@ -639,7 +641,7 @@ sbatch -N 1 torch_test.sh When you are done, clean up the resources in reverse order of creation: ```text -./ghpc destroy ml-example-v6 +./gcluster destroy ml-example-v6 ``` Finally, browse to the [Cloud Console][console-images] to delete your custom @@ -670,8 +672,8 @@ example takes the following steps: Create the deployment folder from the blueprint: ```text -./ghpc create examples/image-builder-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./ghpc deploy image-builder-001" +./gcluster create examples/image-builder-v5-legacy.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./gcluster deploy image-builder-001" ``` Follow the on-screen prompts to approve the creation of each deployment group. @@ -795,8 +797,8 @@ example takes the following steps: Create the deployment folder from the blueprint: ```text -./ghpc create examples/image-builder.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" -./ghpc deploy image-builder-v6-001" +./gcluster create examples/image-builder.yaml --vars "project_id=${GOOGLE_CLOUD_PROJECT}" +./gcluster deploy image-builder-v6-001" ``` Follow the on-screen prompts to approve the creation of each deployment group. @@ -1277,7 +1279,7 @@ To use the blueprint you must supply the project id and the name of an existing bucket: ```shell -./ghpc create community/examples/client-google-cloud-storage.yaml \ +./gcluster create community/examples/client-google-cloud-storage.yaml \ --vars project_id= \ --vars existing_bucket_name= ``` @@ -1535,6 +1537,50 @@ cleaned up when the job is deleted. [storage-gke.yaml]: ../examples/storage-gke.yaml +### [gke-a3-megagpu.yaml] ![community-badge] ![experimental-badge] + +This blueprint shows how to provision a GKE cluster with A3 Mega machines in the toolkit. + +After provisioning the cluster and the nodepool, we need to do the following: +1. Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl +2. Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector +3. Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload +4. Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl +5. Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests + +> [!Note] +> The Kubernetes API server will only allow requests from authorized networks. +> The `gke-cluster` module needs access to the Kubernetes API server +> to apply a manifest. **You must use +> the `authorized_cidr` variable to supply an authorized network which contains +> the IP address of the machine deploying the blueprint, for example +> `--vars authorized_cidr=/32`.** You can use a service like +> [whatismyip.com](https://whatismyip.com) to determine your IP address. 
+ +[gke-a3-megagpu.yaml]: ../examples/gke-a3-megagpu.yaml + +### [gke-a3-highgpu.yaml] ![community-badge] ![experimental-badge] + +This blueprint shows how to provision a GKE cluster with A3 High machines in the toolkit. + +After provisioning the cluster and the nodepool, we need to do the following: +1. Install the GPUDirect binary and configure NCCL: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#gpudirect-tcpx_2 +2. Deploy NRI device injector plugin: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#nri-device-injector +3. Deploy a test workload: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#test-workload +4. Use recommended NCCL configuration settings to improve performance: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl +5. Add GPUDirect to manifests: https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests + +> [!Note] +> The Kubernetes API server will only allow requests from authorized networks. +> The `gke-cluster` module needs access to the Kubernetes API server +> to apply a manifest. **You must use +> the `authorized_cidr` variable to supply an authorized network which contains +> the IP address of the machine deploying the blueprint, for example +> `--vars authorized_cidr=/32`.** You can use a service like +> [whatismyip.com](https://whatismyip.com) to determine your IP address. + +[gke-a3-highgpu.yaml]: ../examples/gke-a3-highgpu.yaml + ### [htc-htcondor.yaml] ![community-badge] ![experimental-badge] This blueprint provisions an auto-scaling [HTCondor][htcondor] pool based upon @@ -1943,7 +1989,7 @@ To avoid these issues, the `ghpc_stage` function can be used to copy a file (or ``` The `ghpc_stage` function will always look first in the path specified in the blueprint. If the file is not found at this path then `ghpc_stage` will look for the staged file in the deployment folder, if a deployment folder exists. -This means that you can redeploy a blueprint (`ghpc deploy -w`) so long as you have the deployment folder from the original deployment, even if locally referenced files are not available. +This means that you can redeploy a blueprint (`gcluster deploy -w`) so long as you have the deployment folder from the original deployment, even if locally referenced files are not available. ## Requirements diff --git a/examples/gke-a3-highgpu.yaml b/examples/gke-a3-highgpu.yaml new file mode 100644 index 0000000000..de25631fa4 --- /dev/null +++ b/examples/gke-a3-highgpu.yaml @@ -0,0 +1,69 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: gke-a3-highgpu + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-a3-highgpu + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. 
+ authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 192.169.0.0/16 + network_count: 4 + subnetwork_cidr_suffix: 24 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gpunets] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. + display_name: "kubectl-access-network" + outputs: [instructions] + + - id: a3-highgpu_pool + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets] + settings: + machine_type: a3-highgpu-8g + autoscaling_total_min_nodes: 2 + zones: [$(vars.zone)] + +# We need to do the following here after deployment: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-highgpuyaml-- diff --git a/examples/gke-a3-megagpu.yaml b/examples/gke-a3-megagpu.yaml new file mode 100644 index 0000000000..51dbcdb0fc --- /dev/null +++ b/examples/gke-a3-megagpu.yaml @@ -0,0 +1,69 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: gke-a3-mega + +vars: + project_id: ## Set GCP Project ID Here ## + deployment_name: gke-a3-mega + region: us-central1 + zone: us-central1-c + + # Cidr block containing the IP of the machine calling terraform. + # The following line must be updated for this example to work. + authorized_cidr: /32 + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: gpunets + source: modules/network/multivpc + settings: + network_name_prefix: $(vars.deployment_name)-gpunet + global_ip_address_range: 192.169.0.0/16 + network_count: 8 + subnetwork_cidr_suffix: 24 + + - id: gke_cluster + source: modules/scheduler/gke-cluster + use: [network1, gpunets] + settings: + enable_private_endpoint: false # Allows for access from authorized public IPs + master_authorized_networks: + - cidr_block: $(vars.authorized_cidr) # Allows your machine run kubectl command. It's required for the multi-network setup. 
+ display_name: "kubectl-access-network" + outputs: [instructions] + + - id: a3-megagpu_pool + source: modules/compute/gke-node-pool + use: [gke_cluster, gpunets] + settings: + machine_type: a3-megagpu-8g + autoscaling_total_min_nodes: 2 + zones: [$(vars.zone)] + +# We need to do the following here after deployment: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-megagpuyaml-- diff --git a/examples/hcls-blueprint.yaml b/examples/hcls-blueprint.yaml index ee55925236..271a9f3ba5 100644 --- a/examples/hcls-blueprint.yaml +++ b/examples/hcls-blueprint.yaml @@ -329,11 +329,11 @@ deployment_groups: partition_name: gpu - id: slurm_login - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-login + source: community/modules/scheduler/schedmd-slurm-gcp-v6-login use: [network] - id: slurm_controller - source: ./community/modules/scheduler/schedmd-slurm-gcp-v6-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller use: - network - compute_partition diff --git a/examples/machine-learning/a3-highgpu-8g/README.md b/examples/machine-learning/a3-highgpu-8g/README.md index 38a967fbe5..5e42a0b075 100644 --- a/examples/machine-learning/a3-highgpu-8g/README.md +++ b/examples/machine-learning/a3-highgpu-8g/README.md @@ -45,23 +45,6 @@ Verify that your release of the Cluster Toolkit is 1.37.0 or later. gcluster --version ``` -The solution requires several Python packages to be available. We recommend -installing them in a Python virtual environment: - -```shell -python3 -m venv toolkit-a3 -source toolkit-a3/bin/activate -pip3 install -r \ - https://raw.githubusercontent.com/GoogleCloudPlatform/slurm-gcp/6.5.13/scripts/requirements.txt -``` - -**Always** activate the environment before running any gcluster commands such as -deploy or destroy. - -```shell -source /absolute/path/to/toolkit-a3/bin/activate -``` - ## Top-Level Design of Solution The solution is split into 3 Cluster Toolkit blueprints: diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml index e14540f0ed..22a4a49e68 100644 --- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml +++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-1-image.yaml @@ -144,6 +144,7 @@ deployment_groups: ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data + ENROOT_TEMP_PATH /mnt/localssd/${UID}/enroot - type: ansible-local destination: configure_gpu_monitoring.yml content: | diff --git a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml index 6063b2eea8..42a823bf8e 100644 --- a/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml +++ b/examples/machine-learning/a3-highgpu-8g/v5-legacy/ml-slurm-a3-1-image-v5-legacy.yaml @@ -147,6 +147,7 @@ deployment_groups: ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data + ENROOT_TEMP_PATH /mnt/localssd/${UID}/enroot EOT ### Install Pyxis if [ ! 
-f "/usr/local/lib/slurm/spank_pyxis.so" ]; then diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml index 2706b8eb60..899fcd6037 100644 --- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml +++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-image.yaml @@ -108,7 +108,7 @@ deployment_groups: apt-get install -y git ansible-galaxy role install googlecloudplatform.google_cloud_ops_agents ansible-pull \ - -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.6.1 \ + -U https://github.com/GoogleCloudPlatform/slurm-gcp -C 6.6.2 \ -i localhost, --limit localhost --connection=local \ -e @/var/tmp/slurm_vars.json \ ansible/playbook.yml @@ -166,6 +166,7 @@ deployment_groups: ENROOT_RUNTIME_PATH /mnt/localssd/${UID}/enroot/runtime ENROOT_CACHE_PATH /mnt/localssd/${UID}/enroot/cache ENROOT_DATA_PATH /mnt/localssd/${UID}/enroot/data + ENROOT_TEMP_PATH /mnt/localssd/${UID}/enroot - type: ansible-local destination: configure_gpu_monitoring.yml content: | diff --git a/examples/pfs-parallelstore.yaml b/examples/pfs-parallelstore.yaml index eac758f660..1858556212 100644 --- a/examples/pfs-parallelstore.yaml +++ b/examples/pfs-parallelstore.yaml @@ -47,6 +47,7 @@ deployment_groups: use: [network, parallelstore] settings: name_prefix: debian + add_deployment_name_before_prefix: true instance_count: 1 instance_image: family: debian-12 @@ -59,6 +60,7 @@ deployment_groups: use: [network, parallelstore] settings: name_prefix: ubuntu + add_deployment_name_before_prefix: true instance_count: 1 instance_image: family: ubuntu-2204-lts diff --git a/examples/ps-slurm.yaml b/examples/ps-slurm.yaml index 11c492d6e3..f139aa7b3c 100644 --- a/examples/ps-slurm.yaml +++ b/examples/ps-slurm.yaml @@ -24,6 +24,14 @@ vars: deployment_name: parallelstore-slurm region: us-east4 zone: us-east4-b + compute_node_machine_type: c2-standard-60 + + # The Parallelstore drivers installation takes a long time. + # Increase the timeout to 20 minutes (default is 5 minutes). 
+ compute_startup_scripts_timeout: $(20*60) + login_startup_scripts_timeout: $(20*60) + controller_startup_scripts_timeout: $(20*60) + resume_timeout: $(20*60) deployment_groups: - group: primary @@ -46,7 +54,7 @@ deployment_groups: use: [network] settings: node_count_dynamic_max: 4 - machine_type: c2-standard-60 + machine_type: $(vars.compute_node_machine_type) enable_placement: false # the default is: true allow_automatic_updates: false diff --git a/examples/serverless-batch.yaml b/examples/serverless-batch.yaml index 538e7d9671..9c0f89c0b9 100644 --- a/examples/serverless-batch.yaml +++ b/examples/serverless-batch.yaml @@ -46,9 +46,6 @@ deployment_groups: machine_type: n2-standard-4 task_count: 8 task_count_per_node: 4 - instance_image: - family: batch-hpc-rocky-linux-8-official - project: batch-custom-image allow_automatic_updates: false - id: batch-login diff --git a/go.mod b/go.mod index 7a7435fb98..417a94e2b7 100644 --- a/go.mod +++ b/go.mod @@ -5,15 +5,15 @@ go 1.21 require ( cloud.google.com/go/storage v1.41.0 // indirect github.com/go-git/go-git/v5 v5.12.0 - github.com/hashicorp/go-getter v1.7.5 + github.com/hashicorp/go-getter v1.7.6 github.com/hashicorp/hcl v1.0.0 // indirect - github.com/hashicorp/hcl/v2 v2.21.0 + github.com/hashicorp/hcl/v2 v2.22.0 github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d github.com/otiai10/copy v1.14.0 github.com/pkg/errors v0.9.1 github.com/spf13/afero v1.11.0 github.com/spf13/cobra v1.8.1 - github.com/zclconf/go-cty v1.14.4 + github.com/zclconf/go-cty v1.15.0 golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa google.golang.org/genproto v0.0.0-20240617180043-68d350f18fd4 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c @@ -98,7 +98,7 @@ require ( golang.org/x/crypto v0.24.0 // indirect golang.org/x/net v0.26.0 // indirect golang.org/x/oauth2 v0.21.0 // indirect - golang.org/x/sys v0.21.0 + golang.org/x/sys v0.24.0 golang.org/x/text v0.16.0 // indirect google.golang.org/grpc v1.64.0 // indirect google.golang.org/protobuf v1.34.2 // indirect diff --git a/go.sum b/go.sum index 0c9ffbf1dc..2cff77d322 100644 --- a/go.sum +++ b/go.sum @@ -379,8 +379,8 @@ github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+ github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= -github.com/hashicorp/go-getter v1.7.5 h1:dT58k9hQ/vbxNMwoI5+xFYAJuv6152UNvdHokfI5wE4= -github.com/hashicorp/go-getter v1.7.5/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= +github.com/hashicorp/go-getter v1.7.6 h1:5jHuM+aH373XNtXl9TNTUH5Qd69Trve11tHIrB+6yj4= +github.com/hashicorp/go-getter v1.7.6/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhEyExpmo= github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I= github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mOkIeek= @@ -391,8 +391,8 @@ github.com/hashicorp/hc-install v0.6.4 h1:QLqlM56/+SIIGvGcfFiwMY3z5WGXT066suo/v9 github.com/hashicorp/hc-install v0.6.4/go.mod 
h1:05LWLy8TD842OtgcfBbOT0WMoInBMUSHjmDx10zuBIA= github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= -github.com/hashicorp/hcl/v2 v2.21.0 h1:lve4q/o/2rqwYOgUg3y3V2YPyD1/zkCLGjIV74Jit14= -github.com/hashicorp/hcl/v2 v2.21.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= +github.com/hashicorp/hcl/v2 v2.22.0 h1:hkZ3nCtqeJsDhPRFz5EA9iwcG1hNWGePOTw6oyul12M= +github.com/hashicorp/hcl/v2 v2.22.0/go.mod h1:62ZYHrXgPoX8xBnzl8QzbWq4dyDsDtfCRgIq1rbJEvA= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d h1:g6kHlvZrFPFKeWRj5q/zyJA5gu7rlJGPf17h8hX7LHY= github.com/hashicorp/terraform-config-inspect v0.0.0-20230925220900-5a6f8d18746d/go.mod h1:l8HcFPm9cQh6Q0KSWoYPiePqMvRFenybP1CH2MjKdlg= github.com/hashicorp/terraform-exec v0.21.0 h1:uNkLAe95ey5Uux6KJdua6+cv8asgILFVWkd/RG0D2XQ= @@ -496,8 +496,8 @@ github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9de github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1Zlc8k= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -github.com/zclconf/go-cty v1.14.4 h1:uXXczd9QDGsgu0i/QFR/hzI5NYCHLf6NQw/atrbnhq8= -github.com/zclconf/go-cty v1.14.4/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= +github.com/zclconf/go-cty v1.15.0 h1:tTCRWxsexYUmtt/wVxgDClUe+uQusuI443uL6e+5sXQ= +github.com/zclconf/go-cty v1.15.0/go.mod h1:VvMs5i0vgZdhYawQNq5kePSpLAoz8u1xvZgrPIxfnZE= github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940 h1:4r45xpDWB6ZMSMNJFMOjqrGHynW3DIBuR2H9j0ug+Mo= github.com/zclconf/go-cty-debug v0.0.0-20240509010212-0d6042c53940/go.mod h1:CmBdvvj3nqzfzJ6nTCIwDTPZ56aVGvDrmztiO5g3qrM= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= @@ -732,8 +732,8 @@ golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.21.0 h1:rF+pYz3DAGSQAxAu1CbC7catZg4ebC4UIeIhKxBZvws= -golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg= +golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= diff --git a/modules/README.md b/modules/README.md index 3796a66b67..8df7ac6d5e 100644 --- a/modules/README.md +++ b/modules/README.md @@ -307,7 +307,7 @@ Terraform modules. 
A source can either be a filesystem path or a URL to a git repository: * Filesystem paths - * modules embedded in the `ghpc` executable + * modules embedded in the `gcluster` executable * modules in the local filesystem * Remote modules using [Terraform URL syntax](https://developer.hashicorp.com/terraform/language/modules/sources) * Hosted on [GitHub](https://developer.hashicorp.com/terraform/language/modules/sources#github) @@ -324,13 +324,13 @@ deployment folder on your behalf. #### Embedded Modules -Embedded modules are added to the ghpc binary during compilation and cannot +Embedded modules are added to the gcluster binary during compilation and cannot be edited. To refer to embedded modules, set the source path to `modules/<>` or `community/modules/<>`. The paths match the modules in the repository structure for [core modules](./) and [community modules](../community/modules/). Because the modules are embedded -during compilation, your local copies may differ unless you recompile ghpc. +during compilation, your local copies may differ unless you recompile gcluster. For example, this example snippet uses the embedded pre-existing-vpc module: @@ -352,7 +352,7 @@ following module definition refers the local pre-existing-vpc modules. ``` > **_NOTE:_** Relative paths (beginning with `.` or `..` must be relative to the -> working directory from which `ghpc` is executed. This example would have to be +> working directory from which `gcluster` is executed. This example would have to be > run from a local copy of the Cluster Toolkit repository. An alternative is to use > absolute paths to modules. @@ -392,7 +392,7 @@ release of the filestore module: source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/file-system/filestore?ref=v1.22.1&depth=1 ``` -Because Terraform modules natively support this syntax, ghpc will not copy +Because Terraform modules natively support this syntax, gcluster will not copy GitHub-hosted modules into your deployment folder. Terraform will download them into a hidden folder when you run `terraform init`. @@ -403,12 +403,12 @@ into a hidden folder when you run `terraform init`. ##### GitHub-hosted Packer modules -Packer does not natively support GitHub-hosted modules so `ghpc create` will +Packer does not natively support GitHub-hosted modules so `gcluster create` will copy modules into your deployment folder. -If the module uses `//` package notation, `ghpc create` will copy the entire +If the module uses `//` package notation, `gcluster create` will copy the entire repository to the module path: `deployment_name/group_name/module_id`. However, -when `ghpc deploy` is invoked, it will run Packer from the subdirectory +when `gcluster deploy` is invoked, it will run Packer from the subdirectory `deployment_name/group_name/module_id/subdirectory/after/double_slash`. Referring back to the [Intel DAOS blueprint][pfs-daos.yaml], we see that it will @@ -417,10 +417,10 @@ create 2 deployment groups at `pfs-daos/daos-client-image` and a subdirectories ending in `daos-client-image/images` and `daos-server-image/images`. -If the module does not use `//` package notation, `ghpc create` will copy +If the module does not use `//` package notation, `gcluster create` will copy only the final directory in the path to `deployment_name/group_name/module_id`. 
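As a rough illustration of the copy behavior described above, a hypothetical GitHub-hosted Packer module using `//` package notation could be referenced as follows (the repository URL and module ID are illustrative only, not a real module):

```yaml
  - id: custom-image
    # Everything before `//` is copied to <deployment>/<group>/custom-image;
    # `gcluster deploy` then runs Packer from .../custom-image/images.
    source: github.com/example-org/example-packer-modules//images?ref=v1.0.0&depth=1
```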
-In all cases, `ghpc create` will remove the `.git` directory from the packer +In all cases, `gcluster create` will remove the `.git` directory from the packer module to ensure that you can manage the entire deployment directory with its own git versioning. @@ -504,7 +504,7 @@ to `$(network1.network_self_link)` and `$(network1.subnetwork_self_link)` which refer to the [network1 outputs](network/vpc/README#Outputs) of the same names. -The order of precedence that `ghpc` uses in determining when to infer a setting +The order of precedence that `gcluster` uses in determining when to infer a setting value is in the following priority order: 1. Explicitly set in the blueprint using the `settings` field diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 2daf69a794..31a4561457 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -73,6 +73,11 @@ use-cases). In this case, ensure that you turn off the [enable_secure_boot](#input\_enable\_secure\_boot) option to allow unsigned kernel modules to be loaded. +To maximize GPU network bandwidth, nodepools accept multiple VPCs. Pass a multivpc module to gke-node-pool module, and [take these steps] (https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#install-gpudirect-tcpx-nccl) to install GPUDirect, configure NCCL, use recommended settings, and add GPUDirect to your pods. + +> **_NOTE:_** You must [enable multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) feature when creating the GKE cluster. When gke-cluster depends on multivpc (with the use keyword), multi networking will be automatically enabled on the cluster creation. +> When gke-cluster or pre-existing-gke-cluster depends on multivpc (with the use keyword), the [network objects](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment) required for multi networking will be created on the cluster. + ### GPUs Examples There are several ways to add GPUs to a GKE node pool. See @@ -149,7 +154,7 @@ Following is an example of using a GPU attached to an `n1` machine: count: 2 ``` -Finally, the following is an example of using a GPU (with sharing config) attached to an `n1` machine: +The following is an example of using a GPU (with sharing config) attached to an `n1` machine: ```yaml - id: n1-t4-pool @@ -168,6 +173,42 @@ Finally, the following is an example of using a GPU (with sharing config) attach gpu_sharing_strategy: "TIME_SHARING" ``` +Finally, the following is adding multivpc to a node pool: + +```yaml + - id: network + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: multinetwork + source: modules/network/multivpc + settings: + network_name_prefix: multivpc-net + network_count: 8 + global_ip_address_range: 172.16.0.0/12 + subnetwork_cidr_suffix: 16 + + - id: gke-cluster + source: modules/scheduler/gke-cluster + use: [network, multinetwork] + settings: + cluster_name: $(vars.deployment_name) + + - id: a3-megagpu_pool + source: modules/compute/gke-node-pool + use: [gke-cluster, multinetwork] + settings: + machine_type: a3-megagpu-8g + ... +``` + ## License @@ -190,15 +231,15 @@ limitations under the License. 
| Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.2 | -| [google](#requirement\_google) | > 5.0 | -| [google-beta](#requirement\_google-beta) | > 5.0 | +| [google](#requirement\_google) | ~> 5.0 | +| [google-beta](#requirement\_google-beta) | ~> 5.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | > 5.0 | -| [google-beta](#provider\_google-beta) | > 5.0 | +| [google](#provider\_google) | ~> 5.0 | +| [google-beta](#provider\_google-beta) | ~> 5.0 | ## Modules @@ -221,24 +262,28 @@ No modules. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [auto\_upgrade](#input\_auto\_upgrade) | Whether the nodes will be automatically upgraded. | `bool` | `false` | no | | [autoscaling\_total\_max\_nodes](#input\_autoscaling\_total\_max\_nodes) | Total maximum number of nodes in the NodePool. | `number` | `1000` | no | | [autoscaling\_total\_min\_nodes](#input\_autoscaling\_total\_min\_nodes) | Total minimum number of nodes in the NodePool. | `number` | `0` | no | | [cluster\_id](#input\_cluster\_id) | projects/{{project}}/locations/{{location}}/clusters/{{cluster}} | `string` | n/a | yes | -| [compact\_placement](#input\_compact\_placement) | Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes. | `bool` | `false` | no | +| [compact\_placement](#input\_compact\_placement) | DEPRECATED: Use `placement_policy` | `bool` | `null` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for each node. | `number` | `100` | no | | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `null` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | +| [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | -| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info. | `number` | `0` | no | -| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
[See above](#local-ssd-storage) for more info. | `number` | `0` | no | +| [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | +| [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, the default is either [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | +| [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no |
| [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes |
+| [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs to be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, also specify the project\_id in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no |
| [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no |
| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | diff --git a/modules/compute/gke-node-pool/disk_definitions.tf b/modules/compute/gke-node-pool/disk_definitions.tf new file mode 100644 index 0000000000..f7dbebea0a --- /dev/null +++ b/modules/compute/gke-node-pool/disk_definitions.tf @@ -0,0 +1,36 @@ +/** + * Copyright 2024 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +## Required variables: +# local_ssd_count_ephemeral_storage +# local_ssd_count_nvme_block +# machine_type + +locals { + + local_ssd_machines = { + "a3-highgpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + "a3-megagpu-8g" = { local_ssd_count_ephemeral_storage = 16, local_ssd_count_nvme_block = null }, + } + + generated_local_ssd_config = lookup(local.local_ssd_machines, var.machine_type, { local_ssd_count_ephemeral_storage = null, local_ssd_count_nvme_block = null }) + + # Select in priority order: + # (1) var.local_ssd_count_ephemeral_storage and var.local_ssd_count_nvme_block if any is not null + # (2) local.local_ssd_machines if not empty + # (3) default to null value for both local_ssd_count_ephemeral_storage and local_ssd_count_nvme_block + local_ssd_config = (var.local_ssd_count_ephemeral_storage == null && var.local_ssd_count_nvme_block == null) ? local.generated_local_ssd_config : { local_ssd_count_ephemeral_storage = var.local_ssd_count_ephemeral_storage, local_ssd_count_nvme_block = var.local_ssd_count_nvme_block } +} diff --git a/modules/compute/gke-node-pool/gpu_definition.tf b/modules/compute/gke-node-pool/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/modules/compute/gke-node-pool/gpu_definition.tf +++ b/modules/compute/gke-node-pool/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 551ba1f5a5..460b640208 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -30,8 +30,10 @@ locals { effect = "NO_SCHEDULE" }] : [] - autoscale_set = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000 - static_node_set = var.static_node_count != null + autoscale_set = var.autoscaling_total_min_nodes != 0 || var.autoscaling_total_max_nodes != 1000 + static_node_set = var.static_node_count != null + reservation_resource_api_label = "compute.googleapis.com/reservation-name" + specific_reservations_count = try(length(var.reservation_affinity.specific_reservations), 0) } data "google_compute_default_service_account" "default_sa" { @@ -67,9 +69,10 @@ resource "google_container_node_pool" "node_pool" { } dynamic "placement_policy" { - 
for_each = var.compact_placement ? [1] : [] + for_each = var.placement_policy.type != null ? [1] : [] content { - type = "COMPACT" + type = var.placement_policy.type + policy_name = var.placement_policy.name } } @@ -104,12 +107,18 @@ resource "google_container_node_pool" "node_pool" { } } - ephemeral_storage_local_ssd_config { - local_ssd_count = var.local_ssd_count_ephemeral_storage + dynamic "ephemeral_storage_local_ssd_config" { + for_each = local.local_ssd_config.local_ssd_count_ephemeral_storage != null ? [1] : [] + content { + local_ssd_count = local.local_ssd_config.local_ssd_count_ephemeral_storage + } } - local_nvme_ssd_block_config { - local_ssd_count = var.local_ssd_count_nvme_block + dynamic "local_nvme_ssd_block_config" { + for_each = local.local_ssd_config.local_ssd_count_nvme_block != null ? [1] : [] + content { + local_ssd_count = local.local_ssd_config.local_ssd_count_nvme_block + } } shielded_instance_config { @@ -150,6 +159,30 @@ resource "google_container_node_pool" "node_pool" { "net.ipv4.tcp_wmem" = "4096 16384 16777216" } } + + reservation_affinity { + consume_reservation_type = var.reservation_affinity.consume_reservation_type + key = local.specific_reservations_count != 1 ? null : local.reservation_resource_api_label + values = local.specific_reservations_count != 1 ? null : [for reservation in var.reservation_affinity.specific_reservations : reservation.name] + } + + dynamic "host_maintenance_policy" { + for_each = var.host_maintenance_interval != "" ? [1] : [] + content { + maintenance_interval = var.host_maintenance_interval + } + } + } + + network_config { + dynamic "additional_node_network_configs" { + for_each = var.additional_networks + + content { + network = additional_node_network_configs.value.network + subnetwork = additional_node_network_configs.value.subnetwork + } + } } timeouts { @@ -166,9 +199,19 @@ resource "google_container_node_pool" "node_pool" { error_message = "static_node_count cannot be set with either autoscaling_total_min_nodes or autoscaling_total_max_nodes." } precondition { - condition = !(var.local_ssd_count_ephemeral_storage > 0 && var.local_ssd_count_nvme_block > 0) + condition = !(coalesce(local.local_ssd_config.local_ssd_count_ephemeral_storage, 0) > 0 && coalesce(local.local_ssd_config.local_ssd_count_nvme_block, 0) > 0) error_message = "Only one of local_ssd_count_ephemeral_storage or local_ssd_count_nvme_block can be set to a non-zero value." } + precondition { + condition = ( + (var.reservation_affinity.consume_reservation_type != "SPECIFIC_RESERVATION" && local.specific_reservations_count == 0) || + (var.reservation_affinity.consume_reservation_type == "SPECIFIC_RESERVATION" && local.specific_reservations_count == 1) + ) + error_message = <<-EOT + When using NO_RESERVATION or ANY_RESERVATION as the `consume_reservation_type`, `specific_reservations` cannot be set. + On the other hand, with SPECIFIC_RESERVATION you must set `specific_reservations`. + EOT + } } } diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 4cb4bf0af1..251031f108 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -93,24 +93,25 @@ variable "local_ssd_count_ephemeral_storage" { description = <<-EOT The number of local SSDs to attach to each node to back ephemeral storage. Uses NVMe interfaces. Must be supported by `machine_type`. 
+ When set to null, the default is either [set based on machine_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide. [See above](#local-ssd-storage) for more info. EOT type = number - default = 0 + default = null } variable "local_ssd_count_nvme_block" { description = <<-EOT The number of local SSDs to attach to each node to back block storage. Uses NVMe interfaces. Must be supported by `machine_type`. + When set to null, the default is either [set based on machine_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or left for GKE to decide. [See above](#local-ssd-storage) for more info. EOT type = number - default = 0 + default = null } - variable "autoscaling_total_min_nodes" { description = "Total minimum number of nodes in the NodePool." type = number @@ -170,10 +171,36 @@ variable "spot" { default = false } +# tflint-ignore: terraform_unused_declarations variable "compact_placement" { - description = "Places node pool's nodes in a closer physical proximity in order to reduce network latency between nodes." + description = "DEPRECATED: Use `placement_policy`" type = bool - default = false + default = null + validation { + condition = var.compact_placement == null + error_message = "`compact_placement` is deprecated. Use `placement_policy` instead" + } +} + +variable "placement_policy" { + description = <<-EOT + Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy. + It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement. + Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. + EOT + + type = object({ + type = string + name = optional(string) + }) + default = { + type = null + name = null + } + validation { + condition = var.placement_policy.type == null || try(contains(["COMPACT"], var.placement_policy.type), false) + error_message = "`COMPACT` is the only supported value for `placement_policy.type`." + } } variable "service_account_email" { @@ -265,3 +292,64 @@ variable "service_account" { error_message = "service_account is deprecated and replaced with service_account_email and scopes." } } + +variable "additional_networks" { + description = "Additional network interface details for GKE, if any. Providing additional networks adds additional node networks to the node pool" + default = [] + type = list(object({ + network = string + subnetwork = string + subnetwork_project = string + network_ip = string + nic_type = string + stack_type = string + queue_count = number + access_config = list(object({ + nat_ip = string + network_tier = string + })) + ipv6_access_config = list(object({ + network_tier = string + })) + alias_ip_range = list(object({ + ip_cidr_range = string + subnetwork_range_name = string + })) + })) +} + +variable "reservation_affinity" { + description = <<-EOT + Reservation resource to consume. When targeting SPECIFIC_RESERVATION, specific_reservations needs to be specified. + Even though specific_reservations is a list, only one reservation is allowed by the NodePool API. + It is assumed that the specified reservation exists and has available capacity.
+ For a shared reservation, specify the project_id as well in which it was created. + To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared + EOT + type = object({ + consume_reservation_type = string + specific_reservations = optional(list(object({ + name = string + project = optional(string) + }))) + }) + default = { + consume_reservation_type = "NO_RESERVATION" + specific_reservations = [] + } + validation { + condition = contains(["NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"], var.reservation_affinity.consume_reservation_type) + error_message = "Accepted values are: {NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION}" + } +} + +variable "host_maintenance_interval" { + description = "Specifies the frequency of planned maintenance events." + type = string + default = "" + nullable = false + validation { + condition = contains(["", "PERIODIC", "AS_NEEDED"], var.host_maintenance_interval) + error_message = "Invalid host_maintenance_interval value. Must be PERIODIC, AS_NEEDED or the empty string" + } +} diff --git a/modules/compute/gke-node-pool/versions.tf b/modules/compute/gke-node-pool/versions.tf index 8905a94eea..604ae8f58a 100644 --- a/modules/compute/gke-node-pool/versions.tf +++ b/modules/compute/gke-node-pool/versions.tf @@ -18,14 +18,14 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "> 5.0" + version = "~> 5.0" } google-beta = { source = "hashicorp/google-beta" - version = "> 5.0" + version = "~> 5.0" } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.39.0" } } diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index fb3fde84e6..5e6c75181b 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -133,7 +133,7 @@ can be found at [docs/gpu-support.md](../../../docs/gpu-support.md) The `vm-instance` module will be replaced when the `instance_image` variable is changed and `terraform apply` is run on the deployment group folder or -`ghpc deploy` is run. However, it will not be automatically replaced if a new +`gcluster deploy` is run. However, it will not be automatically replaced if a new image is created in a family. To selectively replace the vm-instance(s), consider running terraform @@ -169,16 +169,16 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.3.0 | -| [google](#requirement\_google) | >= 4.73.0 | -| [google-beta](#requirement\_google-beta) | >= 4.73.0 | +| [google](#requirement\_google) | >= 4.73.0, <6.0 | +| [google-beta](#requirement\_google-beta) | >= 4.73.0, <6.0 | | [null](#requirement\_null) | >= 3.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.73.0 | -| [google-beta](#provider\_google-beta) | >= 4.73.0 | +| [google](#provider\_google) | >= 4.73.0, <6.0 | +| [google-beta](#provider\_google-beta) | >= 4.73.0, <6.0 | | [null](#provider\_null) | >= 3.0 | ## Modules @@ -216,7 +216,7 @@ limitations under the License. | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. 
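Taken together, the new `gke-node-pool` inputs introduced above (`placement_policy`, `reservation_affinity`, `host_maintenance_interval`, and the now-nullable local SSD counts) can be exercised from a blueprint. A minimal sketch, assuming a cluster module with id `gke_cluster`; `example-placement-policy` and `example-reservation` are placeholders for resources that must already exist, not values from this change:

```yaml
  - id: a3_mega_pool
    source: modules/compute/gke-node-pool
    use: [gke_cluster]
    settings:
      machine_type: a3-megagpu-8g   # local SSD count and GPU type come from the new lookup tables
      host_maintenance_interval: PERIODIC
      placement_policy:
        type: COMPACT               # COMPACT is the only supported type
        name: example-placement-policy
      reservation_affinity:
        consume_reservation_type: SPECIFIC_RESERVATION
        specific_reservations:
        - name: example-reservation # exactly one reservation may be listed
```

Whether a given combination is accepted ultimately depends on GKE and on the reservation and placement-policy restrictions linked from the variable descriptions.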
| `string` | `"ENABLE"` | no | | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
{
"family": "hpc-centos-7",
"project": "cloud-hpc-image-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
{
"family": "hpc-rocky-linux-8",
"project": "cloud-hpc-image-public"
}
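The default `instance_image` shown above moves from the `hpc-centos-7` family to `hpc-rocky-linux-8`. Blueprints that need a specific image can keep pinning it explicitly; a minimal sketch (the `compute_vms` and `network` module ids are placeholders):

```yaml
  - id: compute_vms
    source: modules/compute/vm-instance
    use: [network]
    settings:
      instance_count: 2
      instance_image:
        family: hpc-rocky-linux-8
        project: cloud-hpc-image-public
```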
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count](#input\_local\_ssd\_count) | The number of local SSDs to attach to each VM. See https://cloud.google.com/compute/docs/disks/local-ssd. | `number` | `0` | no | | [local\_ssd\_interface](#input\_local\_ssd\_interface) | Interface to be used with local SSDs. Can be either 'NVME' or 'SCSI'. No effect unless `local_ssd_count` is also set. | `string` | `"NVME"` | no | diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf index c3c16542b1..6c5d96d286 100644 --- a/modules/compute/vm-instance/gpu_definition.tf +++ b/modules/compute/vm-instance/gpu_definition.tf @@ -37,6 +37,7 @@ locals { "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, "a3-highgpu-8g" = { type = "nvidia-h100-80gb", count = 8 }, + "a3-megagpu-8g" = { type = "nvidia-h100-mega-80gb", count = 8 }, "g2-standard-4" = { type = "nvidia-l4", count = 1 }, "g2-standard-8" = { type = "nvidia-l4", count = 1 }, "g2-standard-12" = { type = "nvidia-l4", count = 1 }, diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index f675325187..a874ddf825 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -30,7 +30,7 @@ variable "instance_image" { type = map(string) default = { project = "cloud-hpc-image-public" - family = "hpc-centos-7" + family = "hpc-rocky-linux-8" } validation { diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 1b46a4e5e1..7aeba60707 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -18,12 +18,12 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.73.0" + version = ">= 4.73.0, <6.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.73.0" + version = ">= 4.73.0, <6.0" } null = { source = "hashicorp/null" @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.39.0" } required_version = ">= 1.3.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index c85733d7e1..dc34a97b6e 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.39.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/gke-persistent-volume/versions.tf b/modules/file-system/gke-persistent-volume/versions.tf index adb28ea217..8933fc8dde 100644 --- a/modules/file-system/gke-persistent-volume/versions.tf +++ b/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.39.0" } } diff --git a/modules/file-system/parallelstore/README.md b/modules/file-system/parallelstore/README.md index 37b4788b53..37e28ed2d2 100644 --- a/modules/file-system/parallelstore/README.md +++ b/modules/file-system/parallelstore/README.md @@ -4,6 +4,11 @@ This module creates [parallelstore](https://cloud.google.com/parallelstore) instance. Parallelstore is Google Cloud's first party parallel file system service based on [Intel DAOS](https://docs.daos.io/v2.2/) +### Supported Operating Systems + +A parallelstore instance can be used with Slurm cluster or compute +VM running Ubuntu 22.04, debian 12 or HPC Rocky Linux 8. + ### Parallelstore Quota To get access to a private preview of Parallelstore APIs, your project needs to @@ -31,9 +36,6 @@ issues. You can specify different mount options as follows, For parallelstore instance, Below snippet creates new VPC and configures private-service-access for this newly created network. -The parallelstore instance created here can be used with Slurm cluster or compute -VM running Ubuntu 22.04, debian 12 or HPC Rocky Linux 8. - ```yaml - id: network source: modules/network/vpc diff --git a/modules/file-system/parallelstore/scripts/install-daos-client.sh b/modules/file-system/parallelstore/scripts/install-daos-client.sh index e59f2cfeb9..b94d617e60 100644 --- a/modules/file-system/parallelstore/scripts/install-daos-client.sh +++ b/modules/file-system/parallelstore/scripts/install-daos-client.sh @@ -22,44 +22,69 @@ for arg in "$@"; do fi done -# Install the DAOS client library -# The following commands should be executed on each client vm. -## For Rocky linux 8. -if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then +OS_ID=$(awk -F '=' '/^ID=/ {print $2}' /etc/os-release | sed -e 's/"//g') +OS_VERSION=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g') +OS_VERSION_MAJOR=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g' -e 's/\..*$//') - # 1) Add the Parallelstore package repository - tee /etc/yum.repos.d/parallelstore-v2-4-el8.repo </dev/null 2>&1 & else - echo "Unsupported operating system. This script only supports Rocky Linux 8, Ubuntu 22.04, and Debian 12." + echo "Unsupported operating system ${OS_ID} ${OS_VERSION}. This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." exit 1 fi diff --git a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh index e59f2cfeb9..b94d617e60 100644 --- a/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh +++ b/modules/file-system/pre-existing-network-storage/scripts/install-daos-client.sh @@ -22,44 +22,69 @@ for arg in "$@"; do fi done -# Install the DAOS client library -# The following commands should be executed on each client vm. -## For Rocky linux 8. 
-if grep -q "ID=\"rocky\"" /etc/os-release && lsb_release -rs | grep -q "8\.[0-9]"; then +OS_ID=$(awk -F '=' '/^ID=/ {print $2}' /etc/os-release | sed -e 's/"//g') +OS_VERSION=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g') +OS_VERSION_MAJOR=$(awk -F '=' '/VERSION_ID/ {print $2}' /etc/os-release | sed -e 's/"//g' -e 's/\..*$//') - # 1) Add the Parallelstore package repository - tee /etc/yum.repos.d/parallelstore-v2-4-el8.repo </dev/null 2>&1 & else - echo "Unsupported operating system. This script only supports Rocky Linux 8, Ubuntu 22.04, and Debian 12." + echo "Unsupported operating system ${OS_ID} ${OS_VERSION}. This script only supports Rocky Linux 8, Redhat 8, Redhat 9, Ubuntu 22.04, and Debian 12." exit 1 fi diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 5f97cdab1b..a26bc82b5b 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/network/firewall-rules/versions.tf b/modules/network/firewall-rules/versions.tf index 485548fdc3..3518992c1a 100644 --- a/modules/network/firewall-rules/versions.tf +++ b/modules/network/firewall-rules/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:firewall-rules/v1.39.0" } required_version = ">= 1.3" diff --git a/modules/network/multivpc/main.tf b/modules/network/multivpc/main.tf index f52b826808..fc7f2b6e9c 100644 --- a/modules/network/multivpc/main.tf +++ b/modules/network/multivpc/main.tf @@ -24,7 +24,7 @@ locals { maximum_subnetworks = pow(2, local.subnetwork_new_bits) additional_networks = [ for vpc in module.vpcs : - merge(var.network_interface_defaults, { subnetwork = vpc.subnetwork_name, subnetwork_project = var.project_id }) + merge(var.network_interface_defaults, { network = vpc.network_name, subnetwork = vpc.subnetwork_name, subnetwork_project = var.project_id }) ] } diff --git a/modules/network/pre-existing-subnetwork/versions.tf b/modules/network/pre-existing-subnetwork/versions.tf index d3524d92f0..55fd273890 100644 --- a/modules/network/pre-existing-subnetwork/versions.tf +++ b/modules/network/pre-existing-subnetwork/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-subnetwork/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 9d9a57638f..ff1c892c78 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 190a2429d0..1dc1d8dd52 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -40,7 +40,7 @@ This can be achieved by one of the 
following 2 approaches: 1. Configuring a VPC with a Cloud NAT in the region of the VM -- Use the \[vpc\] module which automates NAT creation +- Use the [vpc] module which automates NAT creation ### Inbound internet access @@ -143,10 +143,9 @@ environment. SSH access can be enabled one of 2 ways: - Add firewall rules that open SSH to the VM The Packer template defaults to using to the 1st IAP-based solution because it -is more secure (no exposure to public internet) and because the -[Toolkit VPC module](../../network/vpc/README.md) automatically sets up all -necessary firewall rules for SSH tunneling and outbound-only access to the -internet through [Cloud NAT][cloudnat]. +is more secure (no exposure to public internet) and because the [vpc] module +automatically sets up all necessary firewall rules for SSH tunneling and +outbound-only access to the internet through [Cloud NAT][cloudnat]. In either SSH solution, customization scripts should be supplied as files in the [shell_scripts][shell] and [ansible_playbooks][ansible] settings. @@ -293,7 +292,7 @@ No resources. | [shell\_scripts](#input\_shell\_scripts) | A list of paths to local shell scripts which will be uploaded to customize the VM image | `list(string)` | `[]` | no | | [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance (must set var.enabled\_shielded\_vm) |
object({
enable_secure_boot = bool
enable_vtpm = bool
enable_integrity_monitoring = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [source\_image](#input\_source\_image) | Source OS image to build from | `string` | `null` | no | -| [source\_image\_family](#input\_source\_image\_family) | Alternative to source\_image. Specify image family to build from latest image in family | `string` | `"hpc-centos-7"` | no | +| [source\_image\_family](#input\_source\_image\_family) | Alternative to source\_image. Specify image family to build from latest image in family | `string` | `"hpc-rocky-linux-8"` | no | | [source\_image\_project\_id](#input\_source\_image\_project\_id) | A list of project IDs to search for the source image. Packer will search the
first project ID in the list first, and fall back to the next in the list,
until it finds the source image. | `list(string)` | `null` | no | | [ssh\_username](#input\_ssh\_username) | Username to use for SSH access to VM | `string` | `"hpc-toolkit-packer"` | no | | [startup\_script](#input\_startup\_script) | Startup script (as raw string) used to build the custom Linux VM image (overridden by var.startup\_script\_file if both are set) | `string` | `null` | no | @@ -327,3 +326,4 @@ No outputs. [sss]: #input_startup_script [startup-metadata]: https://cloud.google.com/compute/docs/instances/startup-scripts/linux [startup-script]: ../../../modules/scripts/startup-script +[vpc]: ../../network/vpc/README.md diff --git a/modules/packer/custom-image/variables.pkr.hcl b/modules/packer/custom-image/variables.pkr.hcl index 0fd12991d4..3cede102ce 100644 --- a/modules/packer/custom-image/variables.pkr.hcl +++ b/modules/packer/custom-image/variables.pkr.hcl @@ -99,7 +99,7 @@ variable "source_image" { variable "source_image_family" { description = "Alternative to source_image. Specify image family to build from latest image in family" type = string - default = "hpc-centos-7" + default = "hpc-rocky-linux-8" } variable "service_account_email" { diff --git a/modules/scheduler/batch-job-template/README.md b/modules/scheduler/batch-job-template/README.md index 8cf6e6d276..229c1cfab5 100644 --- a/modules/scheduler/batch-job-template/README.md +++ b/modules/scheduler/batch-job-template/README.md @@ -162,7 +162,7 @@ limitations under the License. | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true, instances will have public IPs | `bool` | `true` | no | | [gcloud\_version](#input\_gcloud\_version) | The version of the gcloud cli being used. Used for output instructions. Valid inputs are `"alpha"`, `"beta"` and "" (empty string for default version) | `string` | `""` | no | | [image](#input\_image) | DEPRECATED: Google Cloud Batch compute node image. Ignored if `instance_template` is provided. | `any` | `null` | no | -| [instance\_image](#input\_instance\_image) | Google Cloud Batch compute node image. Ignored if `instance_template` is provided.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "batch-hpc-centos-7-official",
"project": "batch-custom-image"
}
| no | +| [instance\_image](#input\_instance\_image) | Google Cloud Batch compute node image. Ignored if `instance_template` is provided.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"family": "hpc-rocky-linux-8",
"project": "cloud-hpc-image-public"
}
| no | | [instance\_template](#input\_instance\_template) | Compute VM instance template self-link to be used for Google Cloud Batch compute node. If provided, a number of other variables will be ignored as noted by `Ignored if instance_template is provided` in descriptions. | `string` | `null` | no | | [job\_filename](#input\_job\_filename) | The filename of the generated job template file. Will default to `cloud-batch-.json` if not specified | `string` | `null` | no | | [job\_id](#input\_job\_id) | An id for the Google Cloud Batch job. Used for output instructions and file naming. Automatically populated by the module id if not set. If setting manually, ensure a unique value across all jobs. | `string` | n/a | yes | diff --git a/modules/scheduler/batch-job-template/variables.tf b/modules/scheduler/batch-job-template/variables.tf index bfce75666e..f65fbd111e 100644 --- a/modules/scheduler/batch-job-template/variables.tf +++ b/modules/scheduler/batch-job-template/variables.tf @@ -197,8 +197,8 @@ variable "instance_image" { EOD type = map(string) default = { - project = "batch-custom-image" - family = "batch-hpc-centos-7-official" + project = "cloud-hpc-image-public" + family = "hpc-rocky-linux-8" } validation { diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index d7a9b6cb1b..9eb2c44c89 100644 --- a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.38.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.39.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index e8bd757493..d446cd8667 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -42,6 +42,39 @@ are created in the VPC module. By default the `gke-cluster` module will look for ranges with the names `pods` and `services`. These names can be configured using the `pods_ip_range_name` and `services_ip_range_name` settings. +### Multi-networking + +To [enable Multi-networking](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment), pass multivpc module to gke-cluster module as described in example below. Passing a multivpc module enables multi networking and [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en) on the cluster. + +```yaml + - id: network + source: modules/network/vpc + settings: + subnetwork_name: gke-subnet + secondary_ranges: + gke-subnet: + - range_name: pods + ip_cidr_range: 10.4.0.0/14 + - range_name: services + ip_cidr_range: 10.0.32.0/20 + + - id: multinetwork + source: modules/network/multivpc + settings: + network_name_prefix: multivpc-net + network_count: 8 + global_ip_address_range: 172.16.0.0/12 + subnetwork_cidr_suffix: 16 + + - id: gke-cluster + source: modules/scheduler/gke-cluster + use: [network, multinetwork] ## enables multi networking and Dataplane V2 on cluster + settings: + cluster_name: $(vars.deployment_name) +``` + +Find an example of multi networking in GKE [here](../../../examples/gke-a3-megagpu.yaml). + ### Cluster Limitations The current implementations has the following limitations: @@ -73,9 +106,10 @@ limitations under the License. 
| Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.0 | +| [terraform](#requirement\_terraform) | >= 1.3 | | [google](#requirement\_google) | > 5.0 | | [google-beta](#requirement\_google-beta) | > 5.0 | +| [kubectl](#requirement\_kubectl) | >= 1.7.0 | | [kubernetes](#requirement\_kubernetes) | ~> 2.23 | ## Providers @@ -84,6 +118,7 @@ limitations under the License. |------|---------| | [google](#provider\_google) | > 5.0 | | [google-beta](#provider\_google-beta) | > 5.0 | +| [kubectl](#provider\_kubectl) | >= 1.7.0 | ## Modules @@ -103,6 +138,8 @@ limitations under the License. | [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | | [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource | +| [kubectl_manifest.additional_net_params](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.additional_nets](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | | [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source | @@ -110,14 +147,16 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
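Besides deriving these settings from `additional_networks` (for example when a `multivpc` module is passed in with `use`), multi-networking and Dataplane V2 can also be turned on explicitly on the `gke-cluster` module. A minimal sketch, assuming a VPC module with id `network` (ids are placeholders):

```yaml
  - id: gke_cluster
    source: modules/scheduler/gke-cluster
    use: [network]
    settings:
      enable_multi_networking: true   # requires Dataplane V2; immutable once the cluster exists
      enable_dataplane_v2: true
```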
| `[]` | no | | [authenticator\_security\_group](#input\_authenticator\_security\_group) | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no | | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. Used in the GKE cluster name by default and can be configured with `prefix_with_deployment_name`. | `string` | n/a | yes | -| [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. | `bool` | `false` | no | +| [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. If null, will default to false unless using multi-networking, in which case it will default to true | `bool` | `null` | no | | [enable\_filestore\_csi](#input\_enable\_filestore\_csi) | The status of the Filestore Container Storage Interface (CSI) driver addon, which allows the usage of filestore instance as volumes. | `bool` | `false` | no | | [enable\_gcsfuse\_csi](#input\_enable\_gcsfuse\_csi) | The status of the GCSFuse Filestore Container Storage Interface (CSI) driver addon, which allows the usage of a gcs bucket as volumes. | `bool` | `false` | no | | [enable\_master\_global\_access](#input\_enable\_master\_global\_access) | Whether the cluster master is accessible globally (from any region) or only within the same region as the private endpoint. | `bool` | `false` | no | +| [enable\_multi\_networking](#input\_enable\_multi\_networking) | Enables [multi networking](https://cloud.google.com/kubernetes-engine/docs/how-to/setup-multinetwork-support-for-pods#create-a-gke-cluster) (Requires GKE Enterprise). This setting is immutable on clusters and enables [Dataplane V2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2?hl=en). If null, will determine state based on if additional\_networks are passed in. | `bool` | `null` | no | | [enable\_persistent\_disk\_csi](#input\_enable\_persistent\_disk\_csi) | The status of the Google Compute Engine Persistent Disk Container Storage Interface (CSI) driver addon, which allows the usage of a PD as volumes. | `bool` | `true` | no | | [enable\_private\_endpoint](#input\_enable\_private\_endpoint) | (Beta) Whether the master's internal IP address is used as the cluster endpoint. | `bool` | `true` | no | | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 59e6822a19..efd2a30dde 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -30,6 +30,12 @@ locals { }] sa_email = var.service_account_email != null ? 
var.service_account_email : data.google_compute_default_service_account.default_sa.email + + # additional VPCs enable multi networking + derived_enable_multi_networking = coalesce(var.enable_multi_networking, length(var.additional_networks) > 0) + + # multi networking needs enabled Dataplane v2 + derived_enable_dataplane_v2 = coalesce(var.enable_dataplane_v2, local.derived_enable_multi_networking) } data "google_compute_default_service_account" "default_sa" { @@ -85,7 +91,9 @@ resource "google_container_cluster" "gke_cluster" { autoscaling_profile = var.autoscaling_profile } - datapath_provider = var.enable_dataplane_v2 ? "ADVANCED_DATAPATH" : "LEGACY_DATAPATH" + datapath_provider = local.derived_enable_dataplane_v2 ? "ADVANCED_DATAPATH" : "LEGACY_DATAPATH" + + enable_multi_networking = local.derived_enable_multi_networking network_policy { # Enabling NetworkPolicy for clusters with DatapathProvider=ADVANCED_DATAPATH @@ -168,6 +176,14 @@ resource "google_container_cluster" "gke_cluster" { ignore_changes = [ node_config ] + precondition { + condition = !(!coalesce(var.enable_dataplane_v2, true) && local.derived_enable_multi_networking) + error_message = "'enable_dataplane_v2' cannot be false when enabling multi networking." + } + precondition { + condition = !(!coalesce(var.enable_multi_networking, true) && length(var.additional_networks) > 0) + error_message = "'enable_multi_networking' cannot be false when using multivpc module, which passes additional_networks." + } } logging_service = "logging.googleapis.com/kubernetes" @@ -315,3 +331,50 @@ module "workload_identity" { google_container_cluster.gke_cluster ] } + +provider "kubectl" { + host = "https://${google_container_cluster.gke_cluster.endpoint}" + cluster_ca_certificate = base64decode(google_container_cluster.gke_cluster.master_auth[0].cluster_ca_certificate) + token = data.google_client_config.default.access_token + load_config_file = false +} + +resource "kubectl_manifest" "additional_net_params" { + for_each = { for idx, network_info in var.additional_networks : idx => network_info } + + depends_on = [google_container_cluster.gke_cluster] + + yaml_body = < network_info } + + depends_on = [google_container_cluster.gke_cluster, kubectl_manifest.additional_net_params] + + yaml_body = < **_NOTE:_** The `project_id` and `region` settings would be inferred from the > deployment variables of the same name, but they are included here for clarity. +### Multi-networking + +To create network objects in GKE cluster, you can pass a multivpc module to a pre-existing-gke-cluster module instead of [applying a manifest manually](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#create-gke-environment). + +```yaml + - id: network + source: modules/network/vpc + + - id: multinetwork + source: modules/network/multivpc + settings: + network_name_prefix: multivpc-net + network_count: 8 + global_ip_address_range: 172.16.0.0/12 + subnetwork_cidr_suffix: 16 + + - id: existing-gke-cluster ## multinetworking must be enabled in advance when cluster creation + source: modules/scheduler/pre-existing-gke-cluster + use: [multinetwork] + settings: + cluster_name: $(vars.deployment_name) +``` + ## License @@ -52,14 +75,16 @@ limitations under the License. 
| Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.0.0 | +| [terraform](#requirement\_terraform) | >= 1.3 | | [google](#requirement\_google) | > 5.0 | +| [kubectl](#requirement\_kubectl) | >= 1.7.0 | ## Providers | Name | Version | |------|---------| | [google](#provider\_google) | > 5.0 | +| [kubectl](#provider\_kubectl) | >= 1.7.0 | ## Modules @@ -69,12 +94,16 @@ No modules. | Name | Type | |------|------| +| [kubectl_manifest.additional_net_params](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [kubectl_manifest.additional_nets](https://registry.terraform.io/providers/gavinbunney/kubectl/latest/docs/resources/manifest) | resource | +| [google_client_config.default](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/client_config) | data source | | [google_container_cluster.existing_gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks creates relevant network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
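The `additional_networks` object type above is normally populated by the `multivpc` module, but entries can also be written by hand. A hedged sketch under the assumption that the existing cluster already has multi-networking enabled; the network and subnetwork names are placeholders, and only `network` and `subnetwork` feed the generated manifests:

```yaml
  - id: existing_gke_cluster
    source: modules/scheduler/pre-existing-gke-cluster
    settings:
      cluster_name: $(vars.deployment_name)
      additional_networks:
      - network: extra-net-0          # placeholder VPC name
        subnetwork: extra-subnet-0    # placeholder subnetwork name
        subnetwork_project: $(vars.project_id)
        network_ip: null
        nic_type: GVNIC
        stack_type: IPV4_ONLY
        queue_count: null
        access_config: []
        ipv6_access_config: []
        alias_ip_range: []
```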
| `[]` | no | | [cluster\_name](#input\_cluster\_name) | Name of the existing cluster | `string` | n/a | yes | | [project\_id](#input\_project\_id) | Project that hosts the existing cluster | `string` | n/a | yes | | [region](#input\_region) | Region in which to search for the cluster | `string` | n/a | yes | diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index c59e35e8da..4a99e90487 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -19,3 +19,52 @@ data "google_container_cluster" "existing_gke_cluster" { project = var.project_id location = var.region } + +data "google_client_config" "default" {} + +provider "kubectl" { + host = "https://${data.google_container_cluster.existing_gke_cluster.endpoint}" + token = data.google_client_config.default.access_token + cluster_ca_certificate = base64decode(data.google_container_cluster.existing_gke_cluster.master_auth[0].cluster_ca_certificate) + load_config_file = false +} + +resource "kubectl_manifest" "additional_net_params" { + for_each = { for idx, network_info in var.additional_networks : idx => network_info } + + depends_on = [data.google_container_cluster.existing_gke_cluster] + + yaml_body = < network_info } + + depends_on = [data.google_container_cluster.existing_gke_cluster, kubectl_manifest.additional_net_params] + + yaml_body = < str: """Downloads a file from Google Cloud Storage to a local destination. Args: @@ -53,10 +56,17 @@ def unpack_tgz(tar_file: str, destination_folder: str): with tarfile.open(tar_file, "r:gz") as tar: tar.extractall(destination_folder) +def gcluster_path() -> str: + gcluster = "gcluster" + if os.path.exists(gcluster): + return f"./{gcluster}" + if shutil.which(gcluster) is not None: + return gcluster # it's in PATH + raise RuntimeError(f"Could not find {gcluster} in PATH or current directory") + def destroy(deployment_folder: str) -> bool: - import subprocess - import sys - process = subprocess.Popen(["./ghpc" , "destroy", deployment_folder, "--auto-approve"], stdout=subprocess.PIPE) + cmd = f"{gcluster_path()} destroy {deployment_folder} --auto-approve" + process = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE) for line in iter(lambda: process.stdout.read(1), b""): sys.stdout.buffer.write(line) process.wait() @@ -65,19 +75,13 @@ def destroy(deployment_folder: str) -> bool: stdout, stderr = process.communicate() print(f'stdout: {stdout}') print(f'stderr: {stderr}\n\n') - print("Deployment destroy failed. Command to manually destroy:") - print(f"./ghpc destroy {deployment_folder} --auto-approve") + print(f"Deployment destroy failed. 
Command to manually destroy:\n{cmd}") return False print("Deployment destroyed") return True -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("project_id", help="Your Google Cloud project ID.") - parser.add_argument("gcs_tar_path", help="The path to the GCS tar file.") - args = parser.parse_args() - +def main(args: argparse.Namespace) -> None: print('Downloading tgz file') tgz_file = cp_from_gcs(args.gcs_tar_path, ".", args.project_id) @@ -92,4 +96,8 @@ def main(): shutil.rmtree(deployment_folder) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--project_id", type=str, default="hpc-toolkit-dev", help="Your Google Cloud project ID.") + parser.add_argument("gcs_tar_path", help="The path to the GCS tar file.") + + main(parser.parse_args()) diff --git a/tools/cloud-build/README.md b/tools/cloud-build/README.md index 3bf30cacdf..175d2f8e65 100644 --- a/tools/cloud-build/README.md +++ b/tools/cloud-build/README.md @@ -4,7 +4,7 @@ * `daily-tests`: The daily-tests directory contains cloud build configs and support files for running the daily test suite -* `dependency-checks`: Verifies the `ghpc` build in limited dependency +* `dependency-checks`: Verifies the `gcluster` build in limited dependency environments. * `ansible.cfg`: Ansible config used to set common ansible setting for running the test suite. diff --git a/tools/cloud-build/babysit/cli_ui.py b/tools/cloud-build/babysit/cli_ui.py index 8831bbe070..503a9bef34 100644 --- a/tools/cloud-build/babysit/cli_ui.py +++ b/tools/cloud-build/babysit/cli_ui.py @@ -62,7 +62,10 @@ def sleep(self, sec: int) -> None: time.sleep(sec) def _render_summary(self, builds: Sequence[Build]) -> None: - order_fn = lambda bc: (bc.build.status, trig_name(bc.build)) + status_order = { # show success and pending first (as less interesting) + Status.SUCCESS: 0, + Status.PENDING: 1} + order_fn = lambda bc: (status_order.get(bc.build.status, 100), bc.build.status, trig_name(bc.build)) cnt = defaultdict(int) ordered = sorted(latest_by_trigger(builds).values(), key=order_fn) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml index f515e26aac..034c99c54f 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/create_deployment_directory.yml @@ -32,9 +32,8 @@ - name: Create Blueprint ansible.builtin.command: | - ./ghpc create -l ERROR "{{ blueprint_yaml }}" \ + ./gcluster create -l ERROR "{{ blueprint_yaml }}" \ --backend-config bucket={{ state_bucket }} \ - --skip-validators=test_tf_version_for_slurm \ --vars project_id={{ project }} \ --vars deployment_name={{ deployment_name }} \ {{ deployment_vars_str if deployment_vars_str is defined else '' }} diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml index 7144fc4c23..4f7a08c4c0 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml @@ -62,7 +62,7 @@ steps: NFS_DEPLOYMENT_NAME="a3hnfs$${BUILD_ID_SHORT}" destroy_on_exit() { - ./ghpc destroy "$${NFS_DEPLOYMENT_NAME}" --auto-approve + ./gcluster destroy "$${NFS_DEPLOYMENT_NAME}" --auto-approve cat /persistent_volume/image_name | 
xargs -L1 gcloud compute images delete --project "${PROJECT_ID}" --quiet } @@ -70,7 +70,7 @@ steps: ZONE=us-east4-a trap 'destroy_on_exit' EXIT - ./ghpc deploy \ + ./gcluster deploy \ --vars region="$${REGION}" \ --vars zone="$${ZONE}" \ --vars project_id="${PROJECT_ID}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index 4336929f16..c9ae96850f 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -49,7 +49,7 @@ steps: echo ' - id: ubuntu_pool' >> $${SG_EXAMPLE} echo ' source: modules/compute/gke-node-pool' >> $${SG_EXAMPLE} echo ' use: [gke_cluster]' >> $${SG_EXAMPLE} - echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD}' >> $${SG_EXAMPLE} + echo ' settings: {name: ubuntu, image_type: UBUNTU_CONTAINERD, host_maintenance_interval: AS_NEEDED}' >> $${SG_EXAMPLE} # avoids conflict with other tests sed -i "s/gke-subnet/gke-subnet-$${BUILD_ID_SHORT}/" $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/builds/monitoring.yaml b/tools/cloud-build/daily-tests/builds/monitoring.yaml index 52e97c23cc..810fa650ff 100644 --- a/tools/cloud-build/daily-tests/builds/monitoring.yaml +++ b/tools/cloud-build/daily-tests/builds/monitoring.yaml @@ -36,7 +36,7 @@ steps: - -c - | set -x -e - cd /workspace && make ghpc + cd /workspace && make gcluster BUILD_ID_FULL=$BUILD_ID BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} diff --git a/tools/cloud-build/daily-tests/builds/ps-vm-debian.yaml b/tools/cloud-build/daily-tests/builds/ps-vm-debian.yaml new file mode 100644 index 0000000000..97fd3b2a30 --- /dev/null +++ b/tools/cloud-build/daily-tests/builds/ps-vm-debian.yaml @@ -0,0 +1,41 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +tags: +- m.vpc +- m.private-service-access +- m.parallelstore +- m.vm-instance +- vm + +timeout: 14400s # 4hr +steps: +- id: parallelstore-vm + name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner + entrypoint: /bin/bash + env: + - "ANSIBLE_HOST_KEY_CHECKING=false" + - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" + args: + - -c + - | + set -x -e + cd /workspace && make + BUILD_ID_FULL=$BUILD_ID + BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} + + ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="@tools/cloud-build/daily-tests/tests/ps-vm-debian.yml" diff --git a/tools/cloud-build/daily-tests/tests/ps-slurm.yml b/tools/cloud-build/daily-tests/tests/ps-slurm.yml index f487f4bde5..eea04f27c8 100644 --- a/tools/cloud-build/daily-tests/tests/ps-slurm.yml +++ b/tools/cloud-build/daily-tests/tests/ps-slurm.yml @@ -24,6 +24,7 @@ slurm_cluster_name: "psslurm{{ build[0:3] }}" cli_deployment_vars: region: "{{ region }}" zone: "{{ zone }}" + compute_node_machine_type: c2-standard-4 # Note: Pattern matching in gcloud only supports 1 wildcard. 
login_node: "{{ slurm_cluster_name }}-slurm-login-*" controller_node: "{{ slurm_cluster_name }}-controller" diff --git a/tools/cloud-build/daily-tests/tests/ps-vm-debian.yml b/tools/cloud-build/daily-tests/tests/ps-vm-debian.yml new file mode 100644 index 0000000000..91494b3820 --- /dev/null +++ b/tools/cloud-build/daily-tests/tests/ps-vm-debian.yml @@ -0,0 +1,29 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +test_name: test-parallelstore-vm-debian +deployment_name: "parallelstore-vm-{{ build }}" +region: us-central1 +zone: us-central1-a +workspace: /workspace +blueprint_yaml: "{{ workspace }}/examples/pfs-parallelstore.yaml" +network: "{{ deployment_name }}-net" +remote_node: "{{ deployment_name }}-debian-0" +post_deploy_tests: +- test-validation/test-mounts.yml +custom_vars: + mounts: + - /parallelstore diff --git a/tools/cloud-build/images/ghpc-docker/Dockerfile b/tools/cloud-build/images/ghpc-docker/Dockerfile index f4b2e29b4d..746452b921 100644 --- a/tools/cloud-build/images/ghpc-docker/Dockerfile +++ b/tools/cloud-build/images/ghpc-docker/Dockerfile @@ -18,7 +18,7 @@ ARG ref RUN git clone https://github.com/GoogleCloudPlatform/hpc-toolkit.git &&\ cd hpc-toolkit &&\ git checkout ${ref} &&\ - make ghpc &&\ + make gcluster &&\ go install github.com/google/go-licenses@latest &&\ /go/bin/go-licenses check "./..." &&\ /go/bin/go-licenses save "./..." --save_path="THIRD_PARTY_NOTICES" diff --git a/tools/cloud-build/images/test-runner/Dockerfile b/tools/cloud-build/images/test-runner/Dockerfile index 5538328dfe..f7f8ca2708 100644 --- a/tools/cloud-build/images/test-runner/Dockerfile +++ b/tools/cloud-build/images/test-runner/Dockerfile @@ -46,6 +46,6 @@ RUN curl -fsSL https://apt.releases.hashicorp.com/gpg | apt-key add - && \ pip install --no-cache-dir ansible && \ rm -rf ~/.cache/pip/* && \ # compile the binary to warm up `/ghpc_go_cache` - cd /workspace && make ghpc && \ + cd /workspace && make gcluster && \ # remove /workspace to reduce image size rm -rf /workspace diff --git a/tools/serial_port_collector.py b/tools/serial_port_collector.py new file mode 100644 index 0000000000..7e1f78210e --- /dev/null +++ b/tools/serial_port_collector.py @@ -0,0 +1,69 @@ +#!/bin/python3 +# Copyright 2024 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import time +from google.cloud import compute_v1 +from argparse import ArgumentParser, RawTextHelpFormatter + +"""This tool collects serial port output and prints it to the terminal until +the VM is deleted or it hits the timeout (300s). It takes in, project, vm_name +and zone as arguments. The script should only print each line once, using the +line number of the previous serial port retrieval as the starting point of the +next request. + +usage: serial_port_collector.py [-h] -p PROJECT -v VM_NAME -z ZONE [-t TIMEOUT] +""" + +def get_serial_port_output(host_name: str, project: str, zone: str, + start: int = 0) -> str: + # Create a client + client = compute_v1.InstancesClient() + # Initialize request argument(s) + request = compute_v1.GetSerialPortOutputInstanceRequest( + instance=host_name, + project=project, + zone=zone, + start=start, + ) + # Make the request + res = client.get_serial_port_output(request=request) + return res.contents, res.next_ + +if __name__ == "__main__": + parser = ArgumentParser(prog='serial_port_collector.py', + formatter_class=RawTextHelpFormatter) + parser.add_argument("-p", "--project", required=True, type=str, + help="Project where the vm is located") + parser.add_argument("-v", "--vm_name", required=True, type=str, + help="VM name to collect serial port output from") + parser.add_argument("-z", "--zone", required=True, type=str, + help="The zone the vm is located in") + parser.add_argument("-t", "--timeout", type=int, default = 0, + help="Timeout in seconds waiting for the next output "\ + "(values <= 0 are no timeout)") + + args = parser.parse_args() + to = args.timeout + + next=0 + sleep_timer = 2 + ts = time.time() + while to <= 0 or time.time()-ts < to: + out, next = get_serial_port_output(args.vm_name, args.project, + args.zone, next) + if len(out) > 0: + print(out) + ts = time.time() + time.sleep(sleep_timer) diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl index 0fd12991d4..3cede102ce 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl @@ -99,7 +99,7 @@ variable "source_image" { variable "source_image_family" { description = "Alternative to source_image. Specify image family to build from latest image in family" type = string - default = "hpc-centos-7" + default = "hpc-rocky-linux-8" } variable "service_account_email" { diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl index 0fd12991d4..3cede102ce 100644 --- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl @@ -99,7 +99,7 @@ variable "source_image" { variable "source_image_family" { description = "Alternative to source_image. 
Specify image family to build from latest image in family" type = string - default = "hpc-centos-7" + default = "hpc-rocky-linux-8" } variable "service_account_email" { diff --git a/tools/validate_configs/golden_copies/validate.sh b/tools/validate_configs/golden_copies/validate.sh index bc03493aaf..4e0b4f196a 100755 --- a/tools/validate_configs/golden_copies/validate.sh +++ b/tools/validate_configs/golden_copies/validate.sh @@ -42,7 +42,7 @@ run_test() { cp -r tools/validate_configs/golden_copies/configs/files "${tmpdir}/" # Only run from the repo directory if there are local modules, otherwise - # run the test from the test directory using the installed ghpc binary. + # run the test from the test directory using the installed gcluster binary. if grep -q "${LOCAL_SOURCE_PATTERN}" "${cwd}/${bp}"; then cd "${cwd}" else diff --git a/tools/validate_configs/test_configs/README.md b/tools/validate_configs/test_configs/README.md index 812f8449d5..00b1f4fa58 100644 --- a/tools/validate_configs/test_configs/README.md +++ b/tools/validate_configs/test_configs/README.md @@ -3,8 +3,8 @@ This directory contains a set of test blueprint files that can be fed into gHPC to create a deployment. These blueprints are used to run integration tests -against `ghpc`. These blueprints can also be used independently and locally to -verify a local `ghpc` build. +against `gcluster`. These blueprints can also be used independently and locally to +verify a local `gcluster` build. ## Blueprint Descriptions diff --git a/tools/validate_configs/validate_configs.sh b/tools/validate_configs/validate_configs.sh index fb92005498..996b25006f 100755 --- a/tools/validate_configs/validate_configs.sh +++ b/tools/validate_configs/validate_configs.sh @@ -26,7 +26,7 @@ run_test() { exampleFile=$(basename "$example") DEPLOYMENT=$(echo "${exampleFile%.yaml}-$(basename "${tmpdir##*.}")" | sed -e 's/\(.*\)/\L\1/') PROJECT="invalid-project" - VALIDATORS_TO_SKIP="test_project_exists,test_apis_enabled,test_region_exists,test_zone_exists,test_zone_in_region,test_tf_version_for_slurm" + VALIDATORS_TO_SKIP="test_project_exists,test_apis_enabled,test_region_exists,test_zone_exists,test_zone_in_region" GHPC_PATH="${cwd}/ghpc" BP_PATH="${cwd}/${example}" # Cover the three possible starting sequences for local sources: ./ ../ / @@ -35,7 +35,7 @@ run_test() { echo "testing ${example} in ${tmpdir}" # Only run from the repo directory if there are local modules, otherwise - # run the test from the test directory using the installed ghpc binary. + # run the test from the test directory using the installed gcluster binary. if grep -q "${LOCAL_SOURCE_PATTERN}" "${cwd}/${example}"; then cd "${cwd}" else @@ -45,7 +45,7 @@ run_test() { --skip-validators="${VALIDATORS_TO_SKIP}" "${deployment_args[@]}" \ --vars="project_id=${PROJECT},deployment_name=${DEPLOYMENT}" >/dev/null || { - echo "*** ERROR: error creating deployment with ghpc for ${exampleFile}" + echo "*** ERROR: error creating deployment with gcluster for ${exampleFile}" exit 1 } if grep -q "${LOCAL_SOURCE_PATTERN}" "${cwd}/${example}"; then