
Commit

chore: deprecating container_runtime config, agentrm supporting singularity, podman, and apptainer (#9516)
ShreyaLnuHpe authored Jul 15, 2024
1 parent 4eeb4db commit c3e0a41
Showing 16 changed files with 50 additions and 1,710 deletions.
166 changes: 1 addition & 165 deletions .circleci/real_config.yml
@@ -2784,64 +2784,6 @@ jobs:
sleep 5
done
# For when a user wants to use an agent instead of launcher.
- when:
condition:
equal: ["-A", <<parameters.agent-use>>]
steps:
- wait-for-master:
scheme: <<parameters.master-scheme>>
host: <<parameters.master-host>>
port: <<parameters.master-port>>
- run:
name: Transfer and Allocate agent resources on VM
command: |
ZONE=$(terraform -chdir=tools/slurm/terraform output --raw zone)
INSTANCE_NAME=$(terraform -chdir=tools/slurm/terraform output --raw instance_name)
PROJECT=$(terraform -chdir=tools/slurm/terraform output --raw project)
gcloud compute scp agent/build/determined-agent "$INSTANCE_NAME":~ --zone $ZONE
gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- \
srun determined-agent --master-host=<<parameters.master-host>> --master-port=<<parameters.master-port>> --resource-pool=default --container-runtime=<<parameters.container-run-type>>
background: true
- run:
name: Query the job to ensure Determined Agent is running
command: |
# 60 tries spaced 30 seconds apart gives 30 minutes before the query times out
tries=60
# The squeue command gives job name and state information; the format string needs an escape character when passed through the gcloud command.
ZONE=$(terraform -chdir=tools/slurm/terraform output --raw zone)
INSTANCE_NAME=$(terraform -chdir=tools/slurm/terraform output --raw instance_name)
PROJECT=$(terraform -chdir=tools/slurm/terraform output --raw project)
gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- squeue -o "%j\ %T"
# Query until the job name is shown in a running state. The agent job must be running before the e2e tests can run.
until [[ -n $(gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- squeue -o "%j\ %T" | grep "determined-agent" | grep "RUNNING") ]] ; do
if [[ $((--tries)) -eq 0 ]]; then
echo "The job determined-agent did not start. Please check if there are other jobs in the queue preventing this job from starting"
exit 1
fi
echo "Waiting 30s to query for the job name again..."
sleep 30
echo "Retrying job query..."
gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- squeue -o "%j\ %T"
done
- run:
name: Query the slot count to ensure slots are allocated
command: |
tries=20
export DET_USER=determined
export DET_PASS=${INITIAL_USER_PASSWORD}
det slot list
until [[ $(det slot list | wc -l) -gt 2 ]] ; do
if [[ $((--tries)) -eq 0 ]]; then
echo "ERROR: determined-agent failed to register at least 2 slots. Check the 'Transfer and Allocate agent resources on VM' for any failures."
exit 1
fi
echo "Waiting 5s to query slots again..."
sleep 5
echo "Retrying slot query..."
det slot list
done
- run-e2e-tests:
mark: <<parameters.mark>>
wait-for-master: true
@@ -3124,62 +3066,7 @@ jobs:
}
}
EOF
- when:
condition:
equal: ["-A", <<parameters.agent-use>>]
steps:
- run:
name: Allocate Resources on Cluster
# Sed command reduces the slots to 8 to match '-N2 --gpus-per-node=4' usage below.
# Affects test e2e_tests/tests/cluster/test_slurm.py:test_mnist_pytorch_distributed
command: |
DET_MASTER_HOST=<<parameters.determined_master_host>>
# determined_master_host is localhost, so pass the actual hostname to the agent
MASTER_HOST=$(hostname)
MASTER_PORT=$(echo $DET_MASTER_HOST | cut -d: -f2)
sed -i 's/slots_per_trial: 16/slots_per_trial: 8/' examples/tutorials/mnist_pytorch/distributed.yaml
sudo cp agent/dist/determined-agent_linux_amd64_v1/determined-agent /scratch/launcher/.launcher.$HOSTNAME
# Include a 40-minute time limit, and name the job (determined-agent-$HOSTNAME) so we can selectively kill it
sudo srun --uid launcher --export=ALL -N2 --gpus-per-node=4 --time=40 -Jdetermined-agent-$HOSTNAME \
--chdir=/scratch/launcher/.launcher.$HOSTNAME /scratch/launcher/.launcher.$HOSTNAME/determined-agent \
--master-host=$MASTER_HOST --master-port=$MASTER_PORT --resource-pool=default --container-runtime=singularity --slot-type=cuda \
--image-root=/lustre/hdd/foundation_engineering/images
background: true
- run:
name: Query the job to ensure Determined Agent is running
command: |
# 60 tries spaced 30 seconds apart gives 30 minutes before the query times out
tries=60
squeue -u launcher -o "%j %T"
# Query until the job name is shown in a running state. The agent job must be running before the e2e tests can run.
until [[ -n $(squeue -u launcher -o "%j %T" | grep "determined-agent-$HOSTNAME" | grep "RUNNING") ]] ; do
if [[ $((--tries)) -eq 0 ]]; then
echo "The job determined-agent-$HOSTNAME did not start. Please check if there are jobs in the queue preventing this job from starting"
exit 1
fi
echo "Waiting 30s to query for the job name again..."
sleep 30
echo "Retrying job query..."
squeue -u launcher -o "%j %T"
done
- run:
name: Query the slot count to ensure slots are allocated
command: |
tries=20
export DET_PASS=${INITIAL_USER_PASSWORD}
det slot list
until [[ $(det slot list | wc -l) -gt 2 ]] ; do
if [[ $((--tries)) -eq 0 ]]; then
echo "ERROR: determined-agent failed to register at least 2 slots. Check the 'Transfer and Allocate agent resources on VM' for any failures."
exit 1
fi
echo "Waiting 5s to query slots again..."
sleep 5
echo "Retrying slot query..."
det slot list
done
- run-e2e-tests:
mark: <<parameters.mark>>
master-host: localhost
@@ -3190,15 +3077,6 @@
- store_test_results:
path: /tmp/test-results/

- when:
condition:
equal: ["-A", <<parameters.agent-use>>]
steps:
- run:
name: Deallocate Agent Resources on cluster
command: |
scancel -u $USER --jobname=determined-agent-$HOSTNAME
test-e2e:
parameters:
tf2:
@@ -4511,28 +4389,6 @@ workflows:
only:
- main

# Podman over SLURM test using Agent on GCP
- test-e2e-hpc-gcp:
context:
# Provides the GITHUB_USERNAME and GITHUB_TOKEN environment variables
# that are required by the "gh" command for authentication.
- github-read
- gcp
- gcp-ci-cluster-default-user-credentials
matrix:
parameters:
name: [test-e2e-slurm-agent-podman-gcp]
agent-use: ["-A"]
container-run-type: ["podman"]
mark: ["e2e_slurm and not parallel and not gpu_required"]
extra-pytest-flags: ["-k 'not test_slurm_verify_home'"]
requires:
- build-go-ee
filters:
branches:
only:
- main

- test-e2e:
name: test-e2e-rbac
context:
@@ -5413,26 +5269,6 @@ workflows:
- build-go
- request-hpc-tests

# Podman over SLURM test using Agent on GCP
- test-e2e-hpc-gcp:
filters: *upstream-feature-branch
context:
# Provides the GITHUB_USERNAME and GITHUB_TOKEN environment variables
# that are required by the "gh" command for authentication.
- github-read
- gcp
- gcp-ci-cluster-default-user-credentials
matrix:
parameters:
name: [test-e2e-slurm-agent-podman-gcp]
agent-use: ["-A"]
container-run-type: ["podman"]
mark: ["e2e_slurm and not parallel and not gpu_required"]
extra-pytest-flags: ["-k 'not test_slurm_verify_home'"]
requires:
- build-go
- request-hpc-tests

nightly:
when: << pipeline.parameters.do_nightly_tests >>
jobs:
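
The steps removed above all share one pattern: poll a command (`squeue`, `det slot list`) in a bounded retry loop and fail once the tries run out. For reference, a minimal Go sketch of that bounded-polling shape (names like `pollUntil` are hypothetical, not from this repo):

```go
package main

import (
	"fmt"
	"time"
)

// pollUntil runs check every interval, up to tries attempts, and errors once
// the attempts are exhausted -- the same shape as the deleted "Query the job"
// and "Query the slot count" steps.
func pollUntil(tries int, interval time.Duration, check func() bool) error {
	for i := 0; i < tries; i++ {
		if check() {
			return nil
		}
		time.Sleep(interval)
	}
	return fmt.Errorf("condition not met after %d tries", tries)
}

func main() {
	// The removed config used 60 tries spaced 30s apart (30 minutes total);
	// tiny values here so the example terminates quickly.
	err := pollUntil(3, 10*time.Millisecond, func() bool {
		return false // in the CI step this was `squeue ... | grep RUNNING`
	})
	fmt.Println(err) // condition not met after 3 tries
}
```

The deleted YAML inlined this loop twice per job; a shared helper like this is one way such duplication is usually avoided.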
2 changes: 0 additions & 2 deletions agent/cmd/determined-agent/init.go
@@ -118,8 +118,6 @@ func registerAgentConfig() {
registerBool(flags, name("debug"), defaults.Debug, "Enable verbose script output")
registerInt(flags, name("artificial-slots"), defaults.ArtificialSlots, "")
flags.Lookup("artificial-slots").Hidden = true
registerString(flags, name("image-root"), defaults.ImageRoot,
"Path to local container image cache")

// Endpoint TLS flags.
registerBool(flags, name("tls"), defaults.TLS, "Use TLS for the API server")
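
`registerString` is one of a family of flag-registration helpers (`registerBool`, `registerInt`) visible in this hunk. A sketch of what such a helper plausibly looks like — the pflag/viper pairing is an assumption, not something this diff confirms:

```go
package main

import (
	"github.com/spf13/pflag"
	"github.com/spf13/viper"
)

// registerString defines a string flag and binds it to the config key of the
// same name, so CLI flags and config-file values stay in sync.
// (Sketch only; the real helper in init.go may differ.)
func registerString(flags *pflag.FlagSet, name, value, usage string) {
	flags.String(name, value, usage)
	if err := viper.BindPFlag(name, flags.Lookup(name)); err != nil {
		panic(err)
	}
}
```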
57 changes: 16 additions & 41 deletions agent/internal/agent.go
@@ -21,8 +21,6 @@ import (
"github.com/determined-ai/determined/agent/internal/options"
"github.com/determined-ai/determined/agent/pkg/docker"
"github.com/determined-ai/determined/agent/pkg/events"
"github.com/determined-ai/determined/agent/pkg/podman"
"github.com/determined-ai/determined/agent/pkg/singularity"
"github.com/determined-ai/determined/master/pkg/aproto"
"github.com/determined-ai/determined/master/pkg/cproto"
"github.com/determined-ai/determined/master/pkg/device"
@@ -121,46 +119,23 @@ func (a *Agent) run(ctx context.Context) error {
}

a.log.Tracef("setting up %s runtime", a.opts.ContainerRuntime)
var cruntime container.ContainerRuntime
switch a.opts.ContainerRuntime {
case options.PodmanContainerRuntime:
acl, sErr := podman.New(a.opts)
if sErr != nil {
return fmt.Errorf("failed to build podman client: %w", sErr)
}
defer func() {
if cErr := acl.Close(); cErr != nil {
a.log.WithError(cErr).Error("failed to close podman client")
}
}()
cruntime = acl
case options.ApptainerContainerRuntime:
fallthrough
case options.SingularityContainerRuntime:
acl, sErr := singularity.New(a.opts)
if sErr != nil {
return fmt.Errorf("failed to build singularity client: %w", sErr)
}
defer func() {
if cErr := acl.Close(); cErr != nil {
a.log.WithError(cErr).Error("failed to close singularity client")
}
}()
cruntime = acl
case options.DockerContainerRuntime:
dcl, dErr := dclient.NewClientWithOpts(dclient.WithAPIVersionNegotiation(), dclient.FromEnv)
if dErr != nil {
return fmt.Errorf("failed to build docker client: %w", dErr)
}
defer func() {
a.log.Trace("cleaning up docker client")
if cErr := dcl.Close(); cErr != nil {
a.log.WithError(cErr).Error("failed to close docker client")
}
}()
cl := docker.NewClient(dcl)
cruntime = cl
if a.opts.ContainerRuntime != options.DockerContainerRuntime {
a.log.Error(a.opts.ContainerRuntime,
" container runtime is not supported; please update the runtime config to use docker instead.")
return fmt.Errorf("container runtime not available: %s", a.opts.ContainerRuntime)
}

dcl, dErr := dclient.NewClientWithOpts(dclient.WithAPIVersionNegotiation(), dclient.FromEnv)
if dErr != nil {
return fmt.Errorf("failed to build docker client: %w", dErr)
}
defer func() {
a.log.Trace("cleaning up docker client")
if cErr := dcl.Close(); cErr != nil {
a.log.WithError(cErr).Error("failed to close docker client")
}
}()
cruntime := docker.NewClient(dcl)

a.log.Trace("setting up container manager")
outbox := make(chan *aproto.MasterMessage, eventChanSize) // covers many from socket lifetimes
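
The net effect of this hunk: the switch over podman, singularity/apptainer, and docker collapses into a docker-only path that fails fast on anything else. A condensed, self-contained restatement of the new control flow (simplified; the real code also wires the client close into a deferred cleanup):

```go
package main

import (
	"fmt"

	dclient "github.com/docker/docker/client"
)

// Mirrors options.DockerContainerRuntime, the only constant left in options.go.
const dockerContainerRuntime = "docker"

// newRuntimeClient mirrors the replacement logic in Agent.run: reject any
// configured runtime other than Docker, then build a Docker client from the
// environment with API-version negotiation.
func newRuntimeClient(runtime string) (*dclient.Client, error) {
	if runtime != dockerContainerRuntime {
		return nil, fmt.Errorf("container runtime not available: %s", runtime)
	}
	return dclient.NewClientWithOpts(dclient.WithAPIVersionNegotiation(), dclient.FromEnv)
}

func main() {
	if _, err := newRuntimeClient("podman"); err != nil {
		fmt.Println(err) // container runtime not available: podman
	}
}
```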
2 changes: 1 addition & 1 deletion agent/internal/container/container_runtime.go
@@ -13,7 +13,7 @@ import (
"github.com/docker/docker/api/types/filters"
)

// ContainerRuntime is our interface for interacting with runtimes like Docker or Singularity.
// ContainerRuntime is our interface for interacting with runtimes like Docker.
type ContainerRuntime interface {
ReattachContainer(
ctx context.Context,
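
Only the doc comment changes in this file, but with Docker now the sole implementation, a compile-time assertion is the usual Go idiom for pinning that relationship. This is illustrative only — the repo may not contain such a check:

```go
package docker_test

import (
	"github.com/determined-ai/determined/agent/internal/container"
	"github.com/determined-ai/determined/agent/pkg/docker"
)

// Compile-time check (illustrative): the value returned by docker.NewClient
// must satisfy container.ContainerRuntime, the interface this file defines.
var _ container.ContainerRuntime = docker.NewClient(nil)
```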
27 changes: 4 additions & 23 deletions agent/internal/options/options.go
@@ -58,9 +58,8 @@ type Options struct {

Security SecurityOptions `json:"security"`

Debug bool `json:"debug"`
ArtificialSlots int `json:"artificial_slots"`
ImageRoot string `json:"image_root"`
Debug bool `json:"debug"`
ArtificialSlots int `json:"artificial_slots"`

TLS bool `json:"tls"`
TLSCertFile string `json:"tls_cert"`
@@ -80,9 +79,7 @@ type Options struct {
// master config.
AgentReconnectBackoff int `json:"agent_reconnect_backoff"`

ContainerRuntime string `json:"container_runtime"`
SingularityOptions SingularityOptions `json:"singularity_options"`
PodmanOptions PodmanOptions `json:"podman_options"`
ContainerRuntime string `json:"container_runtime"`

ContainerAutoRemoveDisabled bool `json:"container_auto_remove_disabled"`

@@ -214,25 +211,9 @@ type ContainerRuntime string

// Available container runtimes.
const (
ApptainerContainerRuntime = "apptainer"
SingularityContainerRuntime = "singularity"
DockerContainerRuntime = "docker"
PodmanContainerRuntime = "podman"
DockerContainerRuntime = "docker"
)

// SingularityOptions configures how we interact with Singularity.
type SingularityOptions struct {
// AllowNetworkCreation allows the agent to use `singularity run`'s `--net` option, which sets
// up and launches containers into a new network namespace. Disabled by default since this
// requires root or a suid installation with /etc/subuid --fakeroot.
AllowNetworkCreation bool `json:"allow_network_creation"`
}

// PodmanOptions configures how we interact with podman.
type PodmanOptions struct {
AllowNetworkCreation bool `json:"allow_network_creation"` // review
}

// VisibleGPUsFromEnvironment returns GPU visibility information from the environment
// if any, else "".
func VisibleGPUsFromEnvironment() (visDevices string) {
2 changes: 0 additions & 2 deletions agent/internal/options/options_test.go
@@ -97,7 +97,6 @@ security:
master_cert_name: master_certificate
debug: true
artificial_slots: 12
image_root: docker_image_root
tls: true
tls_cert: tls_certificate_file
tls_key: tls_key_file
@@ -136,7 +135,6 @@ container_runtime: docker_runtime_env
},
Debug: true,
ArtificialSlots: 12,
ImageRoot: "docker_image_root",
TLS: true,
TLSCertFile: "tls_certificate_file",
TLSKeyFile: "tls_key_file",
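
The deleted fixture line and struct field pair up one-to-one, which is the whole pattern of this test: unmarshal a YAML fixture into `Options` and compare against an expected literal. A reduced sketch of that pattern with a hypothetical mini-struct (the real `Options` carries json tags, and Determined's config loader may map YAML through those rather than yaml tags):

```go
package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// miniOptions holds just the fields touched by this hunk.
type miniOptions struct {
	Debug            bool   `yaml:"debug"`
	ArtificialSlots  int    `yaml:"artificial_slots"`
	ContainerRuntime string `yaml:"container_runtime"`
}

func main() {
	fixture := `
debug: true
artificial_slots: 12
container_runtime: docker_runtime_env
`
	var got miniOptions
	if err := yaml.Unmarshal([]byte(fixture), &got); err != nil {
		panic(err)
	}
	// Expect: {Debug:true ArtificialSlots:12 ContainerRuntime:docker_runtime_env}
	fmt.Printf("%+v\n", got)
}
```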
(Diffs for the remaining 10 changed files are not shown.)
