From 1b6736308bdee70460fa1419000b0be30872538a Mon Sep 17 00:00:00 2001 From: weicong wang <90941622+weicongw@users.noreply.github.com> Date: Fri, 2 Aug 2024 11:38:20 -0700 Subject: [PATCH] Fix GetJobLogs and e2e-neuron binary not exiting issue. (#465) * Fix GetJobLogs and e2e-neuron binary not exiting issue. * Update ci.yaml --- .github/workflows/ci.yaml | 5 ----- Dockerfile.kubetest2 | 1 + e2e2/internal/framework_extensions/client.go | 3 ++- e2e2/test/cases/neuron/main_test.go | 2 +- e2e2/test/cases/neuron/neuron_test.go | 2 +- e2e2/test/cases/nvidia/main_test.go | 2 +- .../manifests/mpi-job-pytorch-training-single-node.yaml | 2 +- e2e2/test/cases/nvidia/mpi_test.go | 4 ++-- e2e2/test/cases/nvidia/unit_test.go | 2 +- .../eksapi/templates/unmanaged-nodegroup.yaml.template | 2 +- 10 files changed, 11 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b79a575b0..24c7b74d5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -26,11 +26,6 @@ jobs: steps: - uses: actions/checkout@v3 - run: docker build --file e2e2/test/images/nvidia/Dockerfile . - build-bert-inference: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference build-bert-training: runs-on: ubuntu-latest steps: diff --git a/Dockerfile.kubetest2 b/Dockerfile.kubetest2 index af1850554..74f36bd66 100644 --- a/Dockerfile.kubetest2 +++ b/Dockerfile.kubetest2 @@ -20,6 +20,7 @@ COPY e2e2/ . 
RUN go test -c ./test/cases/nvidia -o $GOPATH/bin/e2e-nvidia RUN go test -c ./test/cases/netpol -o $GOPATH/bin/e2e-netpol RUN go test -c ./test/cases/quick -o $GOPATH/bin/e2e-quick +RUN go test -c ./test/cases/neuron -o $GOPATH/bin/e2e-neuron RUN go test -c ./test/cases/inference -o $GOPATH/bin/e2e-inference FROM public.ecr.aws/amazonlinux/amazonlinux:2 diff --git a/e2e2/internal/framework_extensions/client.go b/e2e2/internal/framework_extensions/client.go index ab72b525e..2edf5df95 100644 --- a/e2e2/internal/framework_extensions/client.go +++ b/e2e2/internal/framework_extensions/client.go @@ -123,7 +123,8 @@ func RenderManifests(file []byte, templateData interface{}) ([]byte, error) { } // GetJobLogs get logs from MPIJob -func GetJobLogs(ctx context.Context, restConfig *rest.Config, job k8s.Object) (string, error) { +func GetJobLogs(restConfig *rest.Config, job k8s.Object) (string, error) { + ctx := context.Background() clientset, err := kubernetes.NewForConfig(restConfig) if err != nil { return "", err diff --git a/e2e2/test/cases/neuron/main_test.go b/e2e2/test/cases/neuron/main_test.go index 48801b189..6176bb58c 100644 --- a/e2e2/test/cases/neuron/main_test.go +++ b/e2e2/test/cases/neuron/main_test.go @@ -37,7 +37,7 @@ func TestMain(m *testing.M) { log.Fatalf("failed to initialize test environment: %v", err) } testenv = env.NewWithConfig(cfg) - ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute) + ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute) defer cancel() testenv = testenv.WithContext(ctx) diff --git a/e2e2/test/cases/neuron/neuron_test.go b/e2e2/test/cases/neuron/neuron_test.go index 0c3ad2ac8..e1b8c1afa 100644 --- a/e2e2/test/cases/neuron/neuron_test.go +++ b/e2e2/test/cases/neuron/neuron_test.go @@ -58,7 +58,7 @@ func TestMPIJobPytorchTraining(t *testing.T) { return ctx }). 
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &batchv1.Job{ + log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "neuronx-single-node", Namespace: "default"}, }) if err != nil { diff --git a/e2e2/test/cases/nvidia/main_test.go b/e2e2/test/cases/nvidia/main_test.go index f9e4ef0f3..aa170d3ef 100644 --- a/e2e2/test/cases/nvidia/main_test.go +++ b/e2e2/test/cases/nvidia/main_test.go @@ -50,7 +50,7 @@ func TestMain(m *testing.M) { log.Fatalf("failed to initialize test environment: %v", err) } testenv = env.NewWithConfig(cfg) - ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute) + ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute) defer cancel() testenv = testenv.WithContext(ctx) diff --git a/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml b/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml index 9284ec212..1d95b4967 100644 --- a/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml +++ b/e2e2/test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml @@ -48,7 +48,7 @@ spec: - MXNET_CUDNN_AUTOTUNE_DEFAULT=0 - python - -c - - import os; os.system("git clone https://github.com/pytorch/examples.git /pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python /pytorch-examples/mnist/main.py --epochs 3") + - import os; os.system("git clone https://github.com/pytorch/examples.git /pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python /pytorch-examples/mnist/main.py --epochs 1") resources: limits: nvidia.com/gpu: 1 diff --git a/e2e2/test/cases/nvidia/mpi_test.go b/e2e2/test/cases/nvidia/mpi_test.go index 71b2055dc..d697e47a8 100644 --- 
a/e2e2/test/cases/nvidia/mpi_test.go +++ b/e2e2/test/cases/nvidia/mpi_test.go @@ -62,7 +62,7 @@ func TestMPIJobPytorchTraining(t *testing.T) { return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{ + log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{ ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-single-node", Namespace: "default"}, }) if err != nil { @@ -125,7 +125,7 @@ func TestMPIJobPytorchTraining(t *testing.T) { return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{ + log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{ ObjectMeta: metav1.ObjectMeta{Name: "multi-node-nccl-test", Namespace: "default"}, }) if err != nil { diff --git a/e2e2/test/cases/nvidia/unit_test.go b/e2e2/test/cases/nvidia/unit_test.go index a54ae3da4..235cae31f 100644 --- a/e2e2/test/cases/nvidia/unit_test.go +++ b/e2e2/test/cases/nvidia/unit_test.go @@ -58,7 +58,7 @@ func TestSingleNodeUnitTest(t *testing.T) { return ctx }). 
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { - log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &batchv1.Job{ + log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"}, }) if err != nil { diff --git a/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template b/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template index fd476b680..9ee285da1 100644 --- a/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template +++ b/kubetest2/internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template @@ -24,7 +24,7 @@ Parameters: NodeDiskSize: Type: Number Description: Node disk size in gigabytes. - Default: 20 + Default: 100 NodeCount: Type: Number