-
Notifications
You must be signed in to change notification settings - Fork 728
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add E2E test for gang-scheduling #1736
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,17 +2,24 @@ name: integration test | |
on: | ||
- pull_request | ||
|
||
concurrency: | ||
group: ${{ github.workflow }}-${{ github.ref }} | ||
cancel-in-progress: true | ||
|
||
jobs: | ||
integration-test: | ||
runs-on: ubuntu-latest | ||
strategy: | ||
fail-fast: false | ||
matrix: | ||
kubernetes-version: ["v1.23.12", "v1.24.6", "v1.25.2"] | ||
# TODO (tenzen-y): Add volcano. | ||
gang-scheduler-name: ["none", "scheduler-plugins"] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good idea to extend this to Volcano in the future. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, we should run the E2E tests with Volcano, although we can follow up on that in other PRs. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I created #1738. |
||
|
||
steps: | ||
- name: Checkout | ||
uses: actions/checkout@v3 | ||
|
||
- name: Create k8s Kind Cluster | ||
uses: helm/kind-action@<version> | ||
with: | ||
|
@@ -25,15 +32,19 @@ jobs: | |
./scripts/gha/build-image.sh | ||
env: | ||
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test | ||
|
||
- name: Deploy training operator | ||
run: | | ||
./scripts/gha/setup-training-operator.sh | ||
env: | ||
KIND_CLUSTER: training-operator-cluster | ||
TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test | ||
GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} | ||
KUBERNETES_VERSION: ${{ matrix.kubernetes-version }} | ||
|
||
- name: Run tests | ||
run: | | ||
pip install pytest | ||
python3 -m pip install -e sdk/python; pytest sdk/python/test --log-cli-level=info | ||
env: | ||
GANG_SCHEDULER_NAME: ${{ matrix.gang-scheduler-name }} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -752,12 +752,12 @@ def list_tfjobs( | |
|
||
Args: | ||
namespace: Namespace to list the TFJobs. | ||
timeout: Optional, Kubernetes API server timeout in seconds | ||
to execute the request. | ||
|
||
Returns: | ||
list[KubeflowOrgV1TFJob]: List of TFJobs objects. It returns | ||
empty list if TFJobs cannot be found. | ||
timeout: Optional, Kubernetes API server timeout in seconds | ||
to execute the request. | ||
|
||
Raises: | ||
TimeoutError: Timeout to list TFJobs. | ||
|
@@ -802,6 +802,33 @@ def delete_tfjob( | |
delete_options=delete_options, | ||
) | ||
|
||
def patch_tfjob( | ||
self, | ||
tfjob: models.KubeflowOrgV1TFJob, | ||
name: str, | ||
namespace: str = utils.get_default_target_namespace(), | ||
andreyvelich marked this conversation as resolved.
Show resolved
Hide resolved
|
||
): | ||
"""Patch the TFJob. | ||
|
||
Args: | ||
tfjob: TFJob object of type KubeflowOrgV1TFJob to patch. | ||
name: Name for the TFJob. | ||
namespace: Namespace for the TFJob. | ||
|
||
Raises: | ||
TimeoutError: Timeout to patch TFJob. | ||
RuntimeError: Failed to patch TFJob. | ||
""" | ||
|
||
return utils.patch_job( | ||
custom_api=self.custom_api, | ||
job=tfjob, | ||
name=name, | ||
namespace=namespace, | ||
job_kind=constants.TFJOB_KIND, | ||
job_plural=constants.TFJOB_PLURAL, | ||
) | ||
|
||
# ------------------------------------------------------------------------ # | ||
# PyTorchJob Training Client APIs. | ||
# ------------------------------------------------------------------------ # | ||
|
@@ -1000,6 +1027,33 @@ def delete_pytorchjob( | |
delete_options=delete_options, | ||
) | ||
|
||
def patch_pytorchjob( | ||
self, | ||
pytorchjob: models.KubeflowOrgV1PyTorchJob, | ||
name: str, | ||
namespace: str = utils.get_default_target_namespace(), | ||
): | ||
"""Patch the PyTorchJob. | ||
|
||
Args: | ||
pytorchjob: PyTorchJob object of type KubeflowOrgV1PyTorchJob. | ||
name: Name for the PyTorchJob. | ||
namespace: Namespace for the PyTorchJob. | ||
|
||
Raises: | ||
TimeoutError: Timeout to patch PyTorchJob. | ||
RuntimeError: Failed to patch PyTorchJob. | ||
""" | ||
|
||
return utils.patch_job( | ||
custom_api=self.custom_api, | ||
job=pytorchjob, | ||
name=name, | ||
namespace=namespace, | ||
job_kind=constants.PYTORCHJOB_KIND, | ||
job_plural=constants.PYTORCHJOB_PLURAL, | ||
) | ||
|
||
# ------------------------------------------------------------------------ # | ||
# MXJob Training Client APIs. | ||
# ------------------------------------------------------------------------ # | ||
|
@@ -1044,6 +1098,8 @@ def get_mxjob( | |
Args: | ||
name: Name for the MXJob. | ||
namespace: Namespace for the MXJob. | ||
timeout: Optional, Kubernetes API server timeout in seconds | ||
to execute the request. | ||
|
||
Returns: | ||
KubeflowOrgV1MXJob: MXJob object. | ||
|
@@ -1123,6 +1179,33 @@ def delete_mxjob( | |
delete_options=delete_options, | ||
) | ||
|
||
def patch_mxjob( | ||
self, | ||
mxjob: models.KubeflowOrgV1MXJob, | ||
name: str, | ||
namespace: str = utils.get_default_target_namespace(), | ||
): | ||
"""Patch the MXJob. | ||
|
||
Args: | ||
mxjob: MXJob object of type KubeflowOrgV1MXJob. | ||
name: Name for the MXJob. | ||
namespace: Namespace for the MXJob. | ||
|
||
Raises: | ||
TimeoutError: Timeout to patch MXJob. | ||
RuntimeError: Failed to patch MXJob. | ||
""" | ||
|
||
return utils.patch_job( | ||
custom_api=self.custom_api, | ||
job=mxjob, | ||
name=name, | ||
namespace=namespace, | ||
job_kind=constants.MXJOB_KIND, | ||
job_plural=constants.MXJOB_PLURAL, | ||
) | ||
|
||
# ------------------------------------------------------------------------ # | ||
# XGBoostJob Training Client APIs. | ||
# ------------------------------------------------------------------------ # | ||
|
@@ -1248,6 +1331,33 @@ def delete_xgboostjob( | |
delete_options=delete_options, | ||
) | ||
|
||
def patch_xgboostjob( | ||
self, | ||
xgboostjob: models.KubeflowOrgV1XGBoostJob, | ||
name: str, | ||
namespace: str = utils.get_default_target_namespace(), | ||
): | ||
"""Patch the XGBoostJob. | ||
|
||
Args: | ||
xgboostjob: XGBoostJob object of type KubeflowOrgV1XGBoostJob. | ||
name: Name for the XGBoostJob. | ||
namespace: Namespace for the XGBoostJob. | ||
|
||
Raises: | ||
TimeoutError: Timeout to patch XGBoostJob. | ||
RuntimeError: Failed to patch XGBoostJob. | ||
""" | ||
|
||
return utils.patch_job( | ||
custom_api=self.custom_api, | ||
job=xgboostjob, | ||
name=name, | ||
namespace=namespace, | ||
job_kind=constants.XGBOOSTJOB_KIND, | ||
job_plural=constants.XGBOOSTJOB_PLURAL, | ||
) | ||
|
||
# ------------------------------------------------------------------------ # | ||
# MPIJob Training Client APIs. | ||
# ------------------------------------------------------------------------ # | ||
|
@@ -1323,12 +1433,12 @@ def list_mpijobs( | |
|
||
Args: | ||
namespace: Namespace to list the MPIJobs. | ||
timeout: Optional, Kubernetes API server timeout in seconds | ||
to execute the request. | ||
|
||
Returns: | ||
list[KubeflowOrgV1MPIJob]: List of MPIJobs objects. It returns | ||
empty list if MPIJobs cannot be found. | ||
timeout: Optional, Kubernetes API server timeout in seconds | ||
to execute the request. | ||
Comment on lines
1434
to
-1331
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The […] There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice catch! |
||
|
||
Raises: | ||
TimeoutError: Timeout to list MPIJobs. | ||
|
@@ -1373,6 +1483,33 @@ def delete_mpijob( | |
delete_options=delete_options, | ||
) | ||
|
||
def patch_mpijob( | ||
self, | ||
mpijob: models.KubeflowOrgV1MPIJob, | ||
name: str, | ||
namespace: str = utils.get_default_target_namespace(), | ||
): | ||
"""Patch the MPIJob. | ||
|
||
Args: | ||
mpijob: MPIJob object of type KubeflowOrgV1MPIJob. | ||
name: Name for the MPIJob. | ||
namespace: Namespace for the MPIJob. | ||
|
||
Raises: | ||
TimeoutError: Timeout to patch MPIJob. | ||
RuntimeError: Failed to patch MPIJob. | ||
""" | ||
|
||
return utils.patch_job( | ||
custom_api=self.custom_api, | ||
job=mpijob, | ||
name=name, | ||
namespace=namespace, | ||
job_kind=constants.MPIJOB_KIND, | ||
job_plural=constants.MPIJOB_PLURAL, | ||
) | ||
|
||
# ------------------------------------------------------------------------ # | ||
# PaddleJob Training Client APIs. | ||
# ------------------------------------------------------------------------ # | ||
|
@@ -1417,6 +1554,8 @@ def get_paddlejob( | |
Args: | ||
name: Name for the PaddleJob. | ||
namespace: Namespace for the PaddleJob. | ||
timeout: Optional, Kubernetes API server timeout in seconds | ||
to execute the request. | ||
|
||
Returns: | ||
KubeflowOrgV1PaddleJob: PaddleJob object. | ||
|
@@ -1495,3 +1634,30 @@ def delete_paddlejob( | |
job_plural=constants.PADDLEJOB_PLURAL, | ||
delete_options=delete_options, | ||
) | ||
|
||
def patch_paddlejob( | ||
self, | ||
paddlejob: models.KubeflowOrgV1PaddleJob, | ||
name: str, | ||
namespace: str = utils.get_default_target_namespace(), | ||
): | ||
"""Patch the PaddleJob. | ||
|
||
Args: | ||
paddlejob: PaddleJob object of type KubeflowOrgV1PaddleJob. | ||
name: Name for the PaddleJob. | ||
namespace: Namespace for the PaddleJob. | ||
|
||
Raises: | ||
TimeoutError: Timeout to patch PaddleJob. | ||
RuntimeError: Failed to patch PaddleJob. | ||
""" | ||
|
||
return utils.patch_job( | ||
custom_api=self.custom_api, | ||
job=paddlejob, | ||
name=name, | ||
namespace=namespace, | ||
job_kind=constants.PADDLEJOB_KIND, | ||
job_plural=constants.PADDLEJOB_PLURAL, | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With these parameters added, pushing new commits to the same PR forcefully stops the jobs that are already running and starts new ones.
This lets us reduce CI run times.