Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add test case for unit test and delete the duplicated docker file. #457

Merged
merged 1 commit into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@ jobs:
steps:
- uses: actions/checkout@v3
- run: docker build --build-arg=KUBERNETES_MINOR_VERSION=latest --file Dockerfile.kubetest2 .
build-nccl:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/Dockerfile.aws-efa-nccl-tests .
build-neuronx:
runs-on: ubuntu-latest
steps:
Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/neuron/neuron_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
WithLabel("hardware", "gpu").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
if *neuronTestImage == "" {
t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.neuronx-tests to build the image and -neuronTestImage to set the image url"))
t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/neuron/Dockerfile to build the image and -neuronTestImage to set the image url"))
}
var err error
renderedNeuronSingleNodeManifest, err = fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{
Expand Down
16 changes: 8 additions & 8 deletions e2e2/test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ import (
)

var (
testenv env.Environment
nodeType *string
efaEnabled *bool
ncclTestImage *string
nodeCount int
gpuPerNode int
efaPerNode int
testenv env.Environment
nodeType *string
efaEnabled *bool
nvidiaTestImage *string
nodeCount int
gpuPerNode int
efaPerNode int
)

var (
Expand All @@ -43,7 +43,7 @@ var (

func TestMain(m *testing.M) {
nodeType = flag.String("nodeType", "", "node type for the tests")
ncclTestImage = flag.String("ncclTestImage", "", "nccl test image for nccl tests")
nvidiaTestImage = flag.String("nvidiaTestImage", "", "nccl test image for nccl tests")
efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
cfg, err := envconf.NewFromFlags()
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
spec:
restartPolicy: OnFailure
containers:
- image: {{.NcclTestImage}}
- image: {{.NvidiaTestImage}}
imagePullPolicy: Always
name: nccl-test-launcher
env:
Expand Down Expand Up @@ -82,7 +82,7 @@ spec:
emptyDir:
medium: Memory
containers:
- image: {{.NcclTestImage}}
- image: {{.NvidiaTestImage}}
imagePullPolicy: Always
name: nccl-test-worker
volumeMounts:
Expand Down
8 changes: 4 additions & 4 deletions e2e2/test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type ncclTestManifestTplVars struct {
WorkerNodeCount int
WorkerNodeGpuCount int
GpuPerNode int
NcclTestImage string
NvidiaTestImage string
EfaInterfacePerNode int
EfaUseDeviceRdma int
}
Expand Down Expand Up @@ -77,8 +77,8 @@ func TestMPIJobPytorchTraining(t *testing.T) {
WithLabel("hardware", "gpu").
WithLabel("hardware", "efa").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
if *ncclTestImage == "" {
t.Fatal(fmt.Errorf("efaImage must be set to run nccl test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.aws-efa-nccl-tests to build the image and -efaImage to set the image url"))
if *nvidiaTestImage == "" {
t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url"))
}
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl-base.html#nccl-start-base-test
var EfaUseDeviceRdma int
Expand All @@ -90,7 +90,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
WorkerNodeCount: nodeCount - 1,
WorkerNodeGpuCount: (nodeCount - 1) * gpuPerNode,
GpuPerNode: gpuPerNode,
NcclTestImage: *ncclTestImage,
NvidiaTestImage: *nvidiaTestImage,
EfaInterfacePerNode: efaPerNode,
EfaUseDeviceRdma: EfaUseDeviceRdma,
})
Expand Down
71 changes: 71 additions & 0 deletions e2e2/test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package nvidia

import (
"context"
_ "embed"
"fmt"
"testing"
"time"

fwext "github.com/aws/aws-k8s-tester/e2e2/internal/framework_extensions"
"sigs.k8s.io/e2e-framework/klient/wait"
"sigs.k8s.io/e2e-framework/pkg/envconf"
"sigs.k8s.io/e2e-framework/pkg/features"

batchv1 "k8s.io/api/batch/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var (
// jobUnitTestSingleNodeManifest holds the raw contents of
// manifests/job-unit-test-single-node.yaml, embedded at build time.
//go:embed manifests/job-unit-test-single-node.yaml
jobUnitTestSingleNodeManifest []byte
// renderedJobUnitTestSingleNodeManifest is the output of rendering the
// embedded manifest in the test's Setup step; the same bytes are applied in
// Setup and deleted in Teardown.
renderedJobUnitTestSingleNodeManifest []byte
)

// unitTestManifestTplVars carries the values passed to fwext.RenderManifests
// when rendering the single-node unit test Job manifest.
type unitTestManifestTplVars struct {
// NvidiaTestImage is populated from the -nvidiaTestImage flag; presumably
// referenced by the manifest template as the container image — verify
// against manifests/job-unit-test-single-node.yaml.
NvidiaTestImage string
}

func TestSingleNodeUnitTest(t *testing.T) {
unitTest := features.New("unit-test").
WithLabel("suite", "nvidia").
WithLabel("hardware", "gpu").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
if *nvidiaTestImage == "" {
t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url"))
}
var err error
renderedJobUnitTestSingleNodeManifest, err = fwext.RenderManifests(jobUnitTestSingleNodeManifest, unitTestManifestTplVars{
NvidiaTestImage: *nvidiaTestImage,
})
if err != nil {
t.Fatal(err)
}
err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest)
if err != nil {
t.Fatal(err)
}
return ctx
}).
Assess("Unit test Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"},
}
err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
wait.WithTimeout(time.Minute*20))
if err != nil {
t.Fatal(err)
}
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
err := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest)
if err != nil {
t.Fatal(err)
}
return ctx
}).
Feature()

testenv.Test(t, unitTest)
}
93 changes: 0 additions & 93 deletions e2e2/test/images/Dockerfile.aws-efa-nccl-tests

This file was deleted.

Loading