chore: remove e2e_slurm_gpu series tests #10021

Merged: 1 commit, Oct 10, 2024.
22 changes: 0 additions & 22 deletions .circleci/real_config.yml
@@ -4208,18 +4208,6 @@ workflows:
security:
initial_user_password: ${INITIAL_USER_PASSWORD}

- test-e2e-slurm:
name: test-e2e-slurm-gpu
mark: "e2e_slurm_gpu"
requires:
- package-and-push-system-local-ee
context:
- dev-ci-cluster-default-user-credentials
filters:
branches:
only:
- main

# Singularity over SLURM test on GCP
- test-e2e-hpc-gcp:
context:
@@ -5106,16 +5094,6 @@ workflows:
extra-pytest-flags: "--no-compare-stats"
collect-det-job-logs: false

- test-e2e-slurm:
name: test-e2e-slurm-gpu
context:
- dev-ci-cluster-default-user-credentials
filters: *upstream-feature-branch
mark: "e2e_slurm_gpu"
requires:
- package-and-push-system-local-ee
- request-hpc-tests

- test-e2e-slurm:
name: test-e2e-slurm-enroot-znode
context:
1 change: 0 additions & 1 deletion e2e_tests/pytest.ini
@@ -20,7 +20,6 @@ markers =
e2e_pbs: end to end pbs integration tests
e2e_saml: tests for saml with okta
e2e_slurm: end to end slurm integration tests
e2e_slurm_gpu: end to end slurm GPU tests
e2e_slurm_restart: slurm integration tests that require restarting the master
e2e_slurm_preemption: hpc integration test to ensure preemption is working
e2e_slurm_internet_connected_cluster: slurm integrations for clusters with internet access
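For context on the marker being removed here: markers registered in pytest.ini are what the CI workflows use to select test subsets for each cluster type. A minimal sketch of how that works (the test name below is illustrative, not from this repository), assuming pytest is installed:

```python
import pytest

# Markers must be registered in pytest.ini (as above), otherwise pytest emits
# an unknown-mark warning (or an error under --strict-markers). A test may
# carry several markers so that different CI suites can select it, e.g. with
# `pytest -m e2e_cpu` or `pytest -m e2e_slurm`.
@pytest.mark.e2e_cpu
@pytest.mark.e2e_slurm
def test_example_selected_by_marker() -> None:
    assert True
```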
2 changes: 1 addition & 1 deletion e2e_tests/tests/cluster/test_checkpoints.py
@@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:


@pytest.mark.e2e_gpu
Contributor: Does this need to be a GPU test at all?

Contributor: Same question for test_gc_checkpoints here.

Contributor: Same for test_s3_no_creds (though it seems to be skipped).

Contributor (Author): That might be a question for @stoksc; I think it may be to run those tests on fewer clusters?

Contributor: Wouldn't e2e_cpu cause it to run on fewer clusters?

@pytest.mark.e2e_slurm_gpu
def test_set_gc_policy() -> None:
Contributor (Author): Why was this ever a GPU test? Instead of demoting it to e2e_slurm, I moved the e2e_slurm marker to test_delete_checkpoints, which actually verifies that checkpoint GC works. (I almost deleted test_set_gc_policy() last week because it doesn't check the result of setting the GC policy; it only makes sure the CLI doesn't crash.)

sess = api_utils.user_session()
save_exp_best = 3
@@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:


@pytest.mark.e2e_cpu
@pytest.mark.e2e_slurm
Contributor (Author): I'm open to not testing checkpoint GC on Slurm at all. As far as I can tell, the only way it actually fails is if bind mounts break, and we probably test that sufficiently elsewhere. But since we don't have a good way to surface GC failures in general, keeping one explicit test seems like the right choice.

def test_delete_checkpoints() -> None:
sess = api_utils.user_session()
config = {
1 change: 0 additions & 1 deletion e2e_tests/tests/experiment/test_profiling.py
@@ -12,7 +12,6 @@


@pytest.mark.e2e_gpu
@pytest.mark.e2e_slurm_gpu
Contributor (Author): This tests a REST API, which does not care about the resource manager. The only thing the Python code could additionally exercise is "can you talk to a GPU", and there is essentially zero chance that this fails if GPU training succeeds.

@pytest.mark.timeout(30 * 60)
def test_streaming_observability_metrics_apis() -> None:
sess = api_utils.user_session()
64 changes: 0 additions & 64 deletions e2e_tests/tests/experiment/test_pytorch.py

This file was deleted.

Empty file.
24 changes: 0 additions & 24 deletions e2e_tests/tests/fixtures/pytorch_identity/distributed.yaml

This file was deleted.

79 changes: 0 additions & 79 deletions e2e_tests/tests/fixtures/pytorch_identity/model_def.py

This file was deleted.

3 changes: 1 addition & 2 deletions e2e_tests/tests/nightly/test_pytorch2.py
@@ -7,14 +7,13 @@

@pytest.mark.distributed
@pytest.mark.gpu_required
@pytest.mark.e2e_slurm_gpu
Contributor (Author): I can get behind testing our PyTorch 2 images (so I did not delete the whole test), but I see no reason why this needs to run on Slurm.

def test_pytorch2_hf_language_modeling_distributed() -> None:
sess = api_utils.user_session()
test_dir = "hf_language_modeling"

config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
config = conf.set_pt2_image(config)
config = conf.set_slots_per_trial(config, 4)
Contributor (Author): This brings the GPU test in line with the slots_per_trial used by our other distributed GPU tests.

config = conf.set_slots_per_trial(config, 8)

# Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
config = conf.set_entrypoint(

This file was deleted.

This file was deleted.

79 changes: 79 additions & 0 deletions harness/tests/experiment/fixtures/pytorch_identity/model_def.py
@@ -0,0 +1,79 @@
from typing import Any, Dict, Tuple

import torch.utils.data

from determined import pytorch


class MetricsCallback(pytorch.PyTorchCallback):
    def __init__(self):
        self.validation_metrics = []

    def on_validation_end(self, metrics: Dict[str, Any]) -> None:
        self.validation_metrics.append(metrics)


class IdentityDataset(torch.utils.data.Dataset):
    def __init__(self, initial_value: int = 1):
        self.initial_value = initial_value

    def __len__(self) -> int:
        return 64

    def __getitem__(self, index: int) -> Tuple:
        v = float(self.initial_value + 0.1 * index)
        return torch.Tensor([v]), torch.Tensor([v])


class IdentityPyTorchTrial(pytorch.PyTorchTrial):
    def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
        self.context = context

        model = torch.nn.Linear(1, 1, False)
        model.weight.data.fill_(0)
        self.model = context.wrap_model(model)

        self.lr = 0.001

        optimizer = torch.optim.SGD(self.model.parameters(), self.lr)
        self.opt = context.wrap_optimizer(optimizer)

        self.loss_fn = torch.nn.MSELoss(reduction="mean")
        self.metrics_callback = MetricsCallback()

    def train_batch(
        self, batch: pytorch.TorchData, epoch_idx: int, batch_idx: int
    ) -> Dict[str, torch.Tensor]:
        data, label = batch

        loss = self.loss_fn(self.model(data), label)

        self.context.backward(loss)

        self.context.step_optimizer(self.opt)

        return {
            "loss": loss,
        }

    def evaluate_batch(self, batch: pytorch.TorchData) -> Dict[str, Any]:
        data, label = batch

        loss = self.loss_fn(self.model(data), label)

        weight = self.model.weight.data.item()

        return {"val_loss": loss, "weight": weight}

    def build_training_data_loader(self) -> pytorch.DataLoader:
        return pytorch.DataLoader(
            IdentityDataset(), batch_size=self.context.get_per_slot_batch_size()
        )

    def build_validation_data_loader(self) -> pytorch.DataLoader:
        return pytorch.DataLoader(
            IdentityDataset(20), batch_size=self.context.get_per_slot_batch_size()
        )

    def build_callbacks(self) -> Dict[str, pytorch.PyTorchCallback]:
        return {"metrics": self.metrics_callback}
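For readers unfamiliar with the fixture: the trial above fits the identity function with a single linear weight, so its convergence is easy to reason about. A standalone sketch of the dynamics it relies on (plain PyTorch, not part of this PR):

```python
import torch

# With MSE loss, target == input, and the weight initialized to 0, the
# gradient of (w*x - x)**2 with respect to w is 2*x*(w*x - x), which is
# negative at w = 0, so each SGD step moves w toward 1. This is why the
# fixture's reported "weight" metric approaches 1 as training proceeds.
model = torch.nn.Linear(1, 1, bias=False)
model.weight.data.fill_(0.0)
opt = torch.optim.SGD(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss(reduction="mean")

x = torch.tensor([[1.0]])
loss = loss_fn(model(x), x)  # target equals input
loss.backward()
opt.step()
assert model.weight.item() > 0.0  # the weight moved from 0 toward 1
```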
4 changes: 1 addition & 3 deletions harness/tests/experiment/pytorch/test_pytorch_trial.py
@@ -1468,16 +1468,14 @@ def amp_metrics_test(trial_class, training_metrics, agg_freq=1):
def run_identity(tmp_path: pathlib.Path):
checkpoint_dir = str(tmp_path.joinpath("checkpoint"))

config = utils.load_config(utils.fixtures_path("pytorch_identity/distributed.yaml"))
hparams = config["hyperparameters"]
hparams = {"global_batch_size": 4}

exp_config = utils.make_default_exp_config(
hparams,
scheduling_unit=1,
searcher_metric="validation_loss",
checkpoint_dir=checkpoint_dir,
)
exp_config.update(config)
exp_config["searcher"]["smaller_is_better"] = True

# each subprocess must import separately as trial_class cannot be pickled.
1 change: 0 additions & 1 deletion tools/slurm/README.md
@@ -149,7 +149,6 @@ By default, the `test-e2e-*-gcp` jobs are not run within the `test-e2e` workflow
**On branch `main` and `release/rc` branches, these jobs always run without needing to set the `ci-run-allgcp` label.**

The following test suites currently run only on hardware. They do not run successfully with `make slurmcluster` and thus are not executed via GCP as part of the CI/CD gate:
- `test-e2e-slurm-gpu`: Test is skipped because the compute instance that the tests run on do not have any GPUs.
- `test-e2e-slurm-misconfigured`: This test could be made to work, but requires passing in a misconfigured `master.yaml` to the launcher on GCP, which could be tedious.
- `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
- `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.