Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix GetJobLogs and the missing e2e-neuron binary issue. #465

Merged
merged 3 commits into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,6 @@ jobs:
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/nvidia/Dockerfile .
build-bert-inference:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/bert-inference/Dockerfile e2e2/test/images/bert-inference
build-bert-training:
runs-on: ubuntu-latest
steps:
Expand Down
1 change: 1 addition & 0 deletions Dockerfile.kubetest2
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ COPY e2e2/ .
RUN go test -c ./test/cases/nvidia -o $GOPATH/bin/e2e-nvidia
RUN go test -c ./test/cases/netpol -o $GOPATH/bin/e2e-netpol
RUN go test -c ./test/cases/quick -o $GOPATH/bin/e2e-quick
RUN go test -c ./test/cases/neuron -o $GOPATH/bin/e2e-neuron
RUN go test -c ./test/cases/inference -o $GOPATH/bin/e2e-inference

FROM public.ecr.aws/amazonlinux/amazonlinux:2
Expand Down
3 changes: 2 additions & 1 deletion e2e2/internal/framework_extensions/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@ func RenderManifests(file []byte, templateData interface{}) ([]byte, error) {
}

// GetJobLogs get logs from MPIJob
func GetJobLogs(ctx context.Context, restConfig *rest.Config, job k8s.Object) (string, error) {
func GetJobLogs(restConfig *rest.Config, job k8s.Object) (string, error) {
ctx := context.Background()
Issacwww marked this conversation as resolved.
Show resolved Hide resolved
clientset, err := kubernetes.NewForConfig(restConfig)
if err != nil {
return "", err
Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/neuron/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func TestMain(m *testing.M) {
log.Fatalf("failed to initialize test environment: %v", err)
}
testenv = env.NewWithConfig(cfg)
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute)
defer cancel()
testenv = testenv.WithContext(ctx)

Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/neuron/neuron_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &batchv1.Job{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{Name: "neuronx-single-node", Namespace: "default"},
})
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ func TestMain(m *testing.M) {
log.Fatalf("failed to initialize test environment: %v", err)
}
testenv = env.NewWithConfig(cfg)
ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute)
Issacwww marked this conversation as resolved.
Show resolved Hide resolved
defer cancel()
testenv = testenv.WithContext(ctx)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ spec:
- MXNET_CUDNN_AUTOTUNE_DEFAULT=0
- python
- -c
- import os; os.system("git clone https://github.com/pytorch/examples.git /pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python /pytorch-examples/mnist/main.py --epochs 3")
- import os; os.system("git clone https://github.com/pytorch/examples.git /pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python /pytorch-examples/mnist/main.py --epochs 1")
Issacwww marked this conversation as resolved.
Show resolved Hide resolved
resources:
limits:
nvidia.com/gpu: 1
4 changes: 2 additions & 2 deletions e2e2/test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
ObjectMeta: metav1.ObjectMeta{Name: "pytorch-training-single-node", Namespace: "default"},
})
if err != nil {
Expand Down Expand Up @@ -125,7 +125,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &kubeflowv2beta1.MPIJob{
ObjectMeta: metav1.ObjectMeta{Name: "multi-node-nccl-test", Namespace: "default"},
})
if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func TestSingleNodeUnitTest(t *testing.T) {
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
log, err := fwext.GetJobLogs(ctx, cfg.Client().RESTConfig(), &batchv1.Job{
log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"},
})
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Parameters:
NodeDiskSize:
Type: Number
Description: Node disk size in gigabytes.
Default: 20
Default: 100
Issacwww marked this conversation as resolved.
Show resolved Hide resolved

NodeCount:
Type: Number
Expand Down
Loading