build(deps): update pytorch-lightning requirement from <2.0.0,>=1.9.0…

… to >=1.9.0,<2.1 in /requirements (#2175) * build(deps): update pytorch-lightning requirement in /requirements Updates the requirements on [pytorch-lightning](https://github.com/Lightning-AI/lightning) to permit the latest version. - [Release notes](https://github.com/Lightning-AI/lightning/releases) - [Commits](Lightning-AI/pytorch-lightning@1.6.0...2.1.0) --- updated-dependencies: - dependency-name: pytorch-lightning dependency-type: direct:production ... Signed-off-by: dependabot[bot] <[email protected]> * fix for v2.0 or higher * <2.1.0 * fix seed * Apply suggestions from code review CI would run only with single GPU --------- Signed-off-by: dependabot[bot] <[email protected]> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: SkafteNicki <[email protected]> Co-authored-by: Jirka Borovec <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka <[email protected]> (cherry picked from commit 25bf7a6)
Lightning-AI · May 15, 2024 · e5a00a2 · e5a00a2
1 parent df45079
commit e5a00a2
Show file tree

Hide file tree

Showing 5 changed files with 34 additions and 12 deletions.
diff --git a/.azure/gpu-integrations.yml b/.azure/gpu-integrations.yml
@@ -44,7 +44,12 @@ jobs:
       clean: all
     steps:
       - bash: |
-          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
+          set -ex
+          devices=$(DEVICES)
+          # overwrite and use only a single device
+          device=${devices%,*}
+          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$device"
+          # nvcc --version  # FIXME!
           CUDA_version=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p')
           CUDA_version_mm="${CUDA_version//'.'/''}"
           echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm"
@@ -92,11 +97,10 @@ jobs:
           set -e
           pip list
           python -c "from torch import __version__ as ver ; assert '.'.join(str(ver).split('.')[:2]) == '$(torch-ver)', f'PyTorch: {ver}'"
-          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 2, f'found GPUs: {mgpu}'"
+          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu >= 1, f'found GPUs: {mgpu}'"
         displayName: "Sanity check"
 
-      - bash: python -m pytest integrations -v --durations=25
-        env:
-          PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: "python"
-        workingDirectory: tests
+      - bash: pytest . -v --durations=0 --timeout=360
+        workingDirectory: "tests/integrations/"
+        timeoutInMinutes: "15"
         displayName: "Test integrations"
diff --git a/.azure/gpu-unittests.yml b/.azure/gpu-unittests.yml
@@ -63,7 +63,9 @@ jobs:
 
     steps:
       - bash: |
+          set -ex
           echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
+          # nvcc --version  # FIXME!
           CUDA_version=$(nvcc --version | sed -n 's/^.*release \([0-9]\+\.[0-9]\+\).*$/\1/p')
           CUDA_version_mm="${CUDA_version//'.'/''}"
           echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$CUDA_version_mm"

diff --git a/.github/workflows/ci-integrate.yml b/.github/workflows/ci-integrate.yml
@@ -34,7 +34,7 @@ jobs:
           - { python-version: "3.10", os: "windows" } # todo: https://discuss.pytorch.org/t/numpy-is-not-available-error/146192
         include:
           - { python-version: "3.10", requires: "latest", os: "ubuntu-22.04" }
-          - { python-version: "3.10", requires: "latest", os: "macOS-14" } # M1 machine
+          # - { python-version: "3.10", requires: "latest", os: "macOS-14" } # M1 machine # todo: crashing for MPS out of memory
     env:
       PYTORCH_URL: "https://download.pytorch.org/whl/cpu/torch_stable.html"
       FREEZE_REQUIREMENTS: ${{ ! (github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }}

diff --git a/requirements/_integrate.txt b/requirements/_integrate.txt
@@ -1,4 +1,4 @@
 # continuously validated integration with these expected ranges
 
 # ToDo: investigate and add validation with 2.0+ on GPU
-pytorch-lightning >=1.9.0, <2.0.0
+pytorch-lightning >=1.9.0, <2.1.0
diff --git a/tests/integrations/test_lightning.py b/tests/integrations/test_lightning.py
@@ -19,20 +19,23 @@
 from torch.nn import Linear
 
 if module_available("lightning"):
-    from lightning.pytorch import LightningModule, Trainer
+    from lightning.pytorch import LightningModule, Trainer, seed_everything
     from lightning.pytorch.loggers import CSVLogger
 else:
-    from pytorch_lightning import LightningModule, Trainer
+    from pytorch_lightning import LightningModule, Trainer, seed_everything
     from pytorch_lightning.loggers import CSVLogger
 
 from torchmetrics import MetricCollection
 from torchmetrics.aggregation import SumMetric
 from torchmetrics.classification import BinaryAccuracy, BinaryAveragePrecision
 from torchmetrics.regression import MeanAbsoluteError, MeanSquaredError
+from torchmetrics.utilities.prints import rank_zero_only
 from torchmetrics.wrappers import MultitaskWrapper
 
 from integrations.lightning.boring_model import BoringModel
 
+seed_everything(42)
+
 
 class DiffMetric(SumMetric):
     """DiffMetric inherited from `SumMetric` by overidding its `update` method."""
@@ -239,18 +242,28 @@ def training_step(self, batch, batch_idx):
 
     model = TestModel()
 
-    logger = CSVLogger("tmpdir/logs")
+    class CustomCSVLogger(CSVLogger):
+        """Custom CSVLogger that does not call `experiment.save()` to prevent state being reset."""
+
+        @rank_zero_only
+        def save(self) -> None:
+            pass
+
+    logger = CustomCSVLogger("tmpdir/logs")
+    # is_cuda = torch.cuda.is_available()
+    # cuda_extra = {"devices": int(is_cuda)} if is_cuda else {}
     trainer = Trainer(
         default_root_dir=tmpdir,
         limit_train_batches=2,
         limit_val_batches=0,
         max_epochs=2,
         log_every_n_steps=1,
         logger=logger,
+        # **cuda_extra,
     )
     trainer.fit(model)
 
-    logged_metrics = logger._experiment.metrics
+    logged_metrics = logger.experiment.metrics
 
     epoch_0_step_0 = logged_metrics[0]
     assert "metric_forward" in epoch_0_step_0
@@ -336,13 +349,16 @@ def on_train_epoch_end(self):
             self.log_dict({f"{k}_epoch": v for k, v in metric_vals.items()})
 
     model = TestModel()
+    # is_cuda = torch.cuda.is_available()
+    # cuda_extra = {"devices": int(is_cuda)} if is_cuda else {}
 
     trainer = Trainer(
         default_root_dir=tmpdir,
         limit_train_batches=2,
         limit_val_batches=0,
         max_epochs=1,
         log_every_n_steps=1,
+        # **cuda_extra,
     )
     trainer.fit(model)