Skip to content

Commit

Permalink
Fix GetJobLogs and the missing e2e-neuron binary issue. (#465)
Browse files Browse the repository at this point in the history
* Fix GetJobLogs and the missing e2e-neuron binary issue.

* Update ci.yaml
  • Loading branch information
weicongw authored Aug 2, 2024
1 parent b133519 commit 1b67363
Show file tree
Hide file tree
Showing 10 changed files with 11 additions and 14 deletions.
5 changes: 0 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,6 @@ jobs:
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/nvidia/Dockerfile .
build-bert-inference:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference
build-bert-training:
runs-on: ubuntu-latest
steps:
Expand Down
1 change: 1 addition & 0 deletions Dockerfile.kubetest2
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ COPY e2e2/ .
RUN go test -c ./test/cases/nvidia -o $GOPATH/bin/e2e-nvidia
RUN go test -c ./test/cases/netpol -o $GOPATH/bin/e2e-netpol
RUN go test -c ./test/cases/quick -o $GOPATH/bin/e2e-quick
RUN go test -c ./test/cases/neuron -o $GOPATH/bin/e2e-neuron
RUN go test -c ./test/cases/inference -o $GOPATH/bin/e2e-inference

FROM public.ecr.aws/amazonlinux/amazonlinux:2
Expand Down
3 changes: 2 additions & 1 deletion e2e2/internal/framework_extensions/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@ func RenderManifests(file []byte, templateData interface{}) ([]byte, error) {
}

// GetJobLogs gets logs from MPIJob
func GetJobLogs(ctx context.Context, restConfig *rest.Config, job k8s.Object) (string, error) {
func GetJobLogs(restConfig *rest.Config, job k8s.Object) (string, error) {
ctx := context.Background()
clientset, err := kubernetes.NewForConfig(restConfig)
if err != nil {
return "", err
Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/neuron/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func TestMain(m *testing.M) {
log.Fatalf("failed to initialize test environment: %v", err)
}
testenv = env.NewWithConfig(cfg)
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute)
defer cancel()
testenv = testenv.WithContext(ctx)

Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/neuron/neuron_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &batchv1.Job{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{Name: "neuronx-single-node", Namespace: "default"},
})
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func TestMain(m *testing.M) {
log.Fatalf("failed to initialize test environment: %v", err)
}
testenv = env.NewWithConfig(cfg)
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute)
defer cancel()
testenv = testenv.WithContext(ctx)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ spec:
- MXNET_CUDNN_AUTOTUNE_DEFAULT=0
- python
- -c
- import os; os.system("git clone https://github.com/pytorch/examples.git /pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python /pytorch-examples/mnist/main.py --epochs 3")
- import os; os.system("git clone https://github.com/pytorch/examples.git /pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python /pytorch-examples/mnist/main.py --epochs 1")
resources:
limits:
nvidia.com/gpu: 1
4 changes: 2 additions & 2 deletions e2e2/test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-single-node", Namespace: "default"},
})
if err != nil {
Expand Down Expand Up @@ -125,7 +125,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
ObjectMeta: metav1.ObjectMeta{Name: "multi-node-nccl-test", Namespace: "default"},
})
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func TestSingleNodeUnitTest(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &batchv1.Job{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"},
})
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Parameters:
NodeDiskSize:
Type: Number
Description: Node disk size in gigabytes.
Default: 20
Default: 100

NodeCount:
Type: Number
Expand Down

0 comments on commit 1b67363

Please sign in to comment.