Skip to content

Commit

Permalink
Add test case for unit test and delete the duplicated docker file.
Browse files Browse the repository at this point in the history
  • Loading branch information
weicongw committed Jul 10, 2024
1 parent 5ae025d commit 5da3c97
Show file tree
Hide file tree
Showing 7 changed files with 86 additions and 113 deletions.
5 changes: 0 additions & 5 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,6 @@ jobs:
steps:
- uses: actions/checkout@v3
- run: docker build --build-arg=KUBERNETES_MINOR_VERSION=latest --file Dockerfile.kubetest2 .
build-nccl:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- run: docker build --file e2e2/test/images/Dockerfile.aws-efa-nccl-tests .
build-neuronx:
runs-on: ubuntu-latest
steps:
Expand Down
2 changes: 1 addition & 1 deletion e2e2/test/cases/neuron/neuron_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
WithLabel("hardware", "gpu").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
if *neuronTestImage == "" {
t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.neuronx-tests to build the image and -neuronTestImage to set the image url"))
t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile to build the image and -neuronTestImage to set the image url"))
}
var err error
renderedNeuronSingleNodeManifest, err = fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{
Expand Down
16 changes: 8 additions & 8 deletions e2e2/test/cases/nvidia/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,13 @@ import (
)

var (
testenv env.Environment
nodeType *string
efaEnabled *bool
ncclTestImage *string
nodeCount int
gpuPerNode int
efaPerNode int
testenv env.Environment
nodeType *string
efaEnabled *bool
nvidiaTestImage *string
nodeCount int
gpuPerNode int
efaPerNode int
)

var (
Expand All @@ -43,7 +43,7 @@ var (

func TestMain(m *testing.M) {
nodeType = flag.String("nodeType", "", "node type for the tests")
ncclTestImage = flag.String("ncclTestImage", "", "nccl test image for nccl tests")
nvidiaTestImage = flag.String("nvidiaTestImage", "", "nccl test image for nccl tests")
efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
cfg, err := envconf.NewFromFlags()
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ spec:
spec:
restartPolicy: OnFailure
containers:
- image: {{.NcclTestImage}}
- image: {{.NvidiaTestImage}}
imagePullPolicy: Always
name: nccl-test-launcher
env:
Expand Down Expand Up @@ -82,7 +82,7 @@ spec:
emptyDir:
medium: Memory
containers:
- image: {{.NcclTestImage}}
- image: {{.NvidiaTestImage}}
imagePullPolicy: Always
name: nccl-test-worker
volumeMounts:
Expand Down
8 changes: 4 additions & 4 deletions e2e2/test/cases/nvidia/mpi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type ncclTestManifestTplVars struct {
WorkerNodeCount int
WorkerNodeGpuCount int
GpuPerNode int
NcclTestImage string
NvidiaTestImage string
EfaInterfacePerNode int
EfaUseDeviceRdma int
}
Expand Down Expand Up @@ -77,8 +77,8 @@ func TestMPIJobPytorchTraining(t *testing.T) {
WithLabel("hardware", "gpu").
WithLabel("hardware", "efa").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
if *ncclTestImage == "" {
t.Fatal(fmt.Errorf("efaImage must be set to run nccl test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/Dockerfile.aws-efa-nccl-tests to build the image and -efaImage to set the image url"))
if *nvidiaTestImage == "" {
t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url"))
}
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa-start-nccl-base.html#nccl-start-base-test
var EfaUseDeviceRdma int
Expand All @@ -90,7 +90,7 @@ func TestMPIJobPytorchTraining(t *testing.T) {
WorkerNodeCount: nodeCount - 1,
WorkerNodeGpuCount: (nodeCount - 1) * gpuPerNode,
GpuPerNode: gpuPerNode,
NcclTestImage: *ncclTestImage,
NvidiaTestImage: *nvidiaTestImage,
EfaInterfacePerNode: efaPerNode,
EfaUseDeviceRdma: EfaUseDeviceRdma,
})
Expand Down
71 changes: 71 additions & 0 deletions e2e2/test/cases/nvidia/unit_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package nvidia

import (
"context"
_ "embed"
"fmt"
"testing"
"time"

fwext "github.com/aws/aws-k8s-tester/e2e2/internal/framework_extensions"
"sigs.k8s.io/e2e-framework/klient/wait"
"sigs.k8s.io/e2e-framework/pkg/envconf"
"sigs.k8s.io/e2e-framework/pkg/features"

batchv1 "k8s.io/api/batch/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

var (
// jobUnitTestSingleNodeManifest holds the raw bytes of the manifest file
// named in the embed directive below. TestSingleNodeUnitTest passes it to
// fwext.RenderManifests as the input to render.
//
//go:embed manifests/job-unit-test-single-node.yaml
jobUnitTestSingleNodeManifest []byte
// renderedJobUnitTestSingleNodeManifest is assigned the result of
// fwext.RenderManifests during the Setup step of TestSingleNodeUnitTest.
// It is kept at package level so that the same bytes can be applied in
// Setup and deleted in Teardown.
renderedJobUnitTestSingleNodeManifest []byte
)

// unitTestManifestTplVars is the set of values handed to
// fwext.RenderManifests when rendering the single-node unit test manifest.
type unitTestManifestTplVars struct {
// NvidiaTestImage is populated from the -nvidiaTestImage flag.
// NOTE(review): presumably substituted as the container image in the
// manifest template; the template is not visible here, so confirm.
NvidiaTestImage string
}

func TestSingleNodeUnitTest(t *testing.T) {
unitTest := features.New("unit-test").
WithLabel("suite", "nvidia").
WithLabel("hardware", "gpu").
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
if *nvidiaTestImage == "" {
t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/e2e2/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url"))
}
var err error
renderedJobUnitTestSingleNodeManifest, err = fwext.RenderManifests(jobUnitTestSingleNodeManifest, unitTestManifestTplVars{
NvidiaTestImage: *nvidiaTestImage,
})
if err != nil {
t.Fatal(err)
}
err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest)
if err != nil {
t.Fatal(err)
}
return ctx
}).
Assess("Unit test Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"},
}
err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job),
wait.WithTimeout(time.Minute*20))
if err != nil {
t.Fatal(err)
}
return ctx
}).
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
err := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest)
if err != nil {
t.Fatal(err)
}
return ctx
}).
Feature()

testenv.Test(t, unitTest)
}
93 changes: 0 additions & 93 deletions e2e2/test/images/Dockerfile.aws-efa-nccl-tests

This file was deleted.

0 comments on commit 5da3c97

Please sign in to comment.