ci: Cache datasets (#315)

* Add conftest.py
* Add dataset caching
* Use tmpdir to use cached dataset and remove unused tmpdir
* Use data_dir for datasets and tmpdir for logs and weights
* Use cached datasets in cli tests
* Keep line length <= 120
* Remove unnecessary import
* Add TODO
* Use DATA_DIR to specify path in pytest.mark.parametrize
* Fix typo
* Add notes
* Fix data_dir in LitMNIST
* data_dir
* clean names
* Path()
* City

Co-authored-by: Jirka Borovec <[email protected]>
Lightning-Universe · Nov 6, 2020 · bc01085 · bc01085
1 parent ef34a17
commit bc01085
Show file tree

Hide file tree

Showing 24 changed files with 123 additions and 92 deletions.
diff --git a/.github/workflows/ci_test-full.yml b/.github/workflows/ci_test-full.yml
@@ -74,12 +74,11 @@ jobs:
         pip list
       shell: bash
 
-    #- name: Cache datasets
-    #  uses: actions/cache@v2
-    #  with:
-    #    path: Datasets # This path is specific to Ubuntu
-    #    # Look to see if there is a cache hit for the corresponding requirements file
-    #    key: pl-datasets
+    - name: Cache datasets
+      uses: actions/cache@v2
+      with:
+        path: ./datasets
+        key: pl-datasets-${{ hashFiles('tests/conftest.py') }}
 
     - name: Tests
       run: |

diff --git a/pl_bolts/models/mnist_module.py b/pl_bolts/models/mnist_module.py
@@ -1,4 +1,3 @@
-import os
 from argparse import ArgumentParser
 from warnings import warn
 
@@ -70,7 +69,7 @@ def val_dataloader(self):
         return loader
 
     def test_dataloader(self):
-        test_dataset = MNIST(os.getcwd(), train=False, download=True, transform=transforms.ToTensor())
+        test_dataset = MNIST(self.hparams.data_dir, train=False, download=True, transform=transforms.ToTensor())
         loader = DataLoader(test_dataset, batch_size=self.hparams.batch_size, num_workers=self.hparams.num_workers)
         return loader
 

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -2,8 +2,9 @@
 
 from pytorch_lightning import seed_everything
 
-TEST_ROOT = os.path.dirname(__file__)
+TEST_ROOT = os.path.realpath(os.path.dirname(__file__))
 PACKAGE_ROOT = os.path.dirname(TEST_ROOT)
+DATASETS_PATH = os.path.join(PACKAGE_ROOT, 'datasets')
 # generate a list of random seeds for each test
 ROOT_SEED = 1234
 

diff --git a/tests/callbacks/test_info_callbacks.py b/tests/callbacks/test_info_callbacks.py
@@ -1,7 +1,7 @@
 from pl_bolts.callbacks import PrintTableMetricsCallback
 
 
-def test_printtable_metrics_callback(tmpdir):
+def test_printtable_metrics_callback():
     callback = PrintTableMetricsCallback()
 
     metrics_a = {'loss': 1.0, 'epoch': 0}

diff --git a/tests/callbacks/test_param_update_callbacks.py b/tests/callbacks/test_param_update_callbacks.py
@@ -6,7 +6,7 @@
 from pl_bolts.callbacks.byol_updates import BYOLMAWeightUpdate
 
 
-def test_byol_ma_weight_update_callback(tmpdir):
+def test_byol_ma_weight_update_callback():
     a = nn.Linear(100, 10)
     b = deepcopy(a)
     a_original = deepcopy(a)

diff --git a/tests/callbacks/test_variational_callbacks.py b/tests/callbacks/test_variational_callbacks.py
@@ -4,7 +4,7 @@
 from pl_bolts.models.gans import GAN
 
 
-def test_latent_dim_interpolator(tmpdir):
+def test_latent_dim_interpolator():
 
     class FakeTrainer(object):
         def __init__(self):

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,15 @@
+from pathlib import Path
+
+import pytest
+
+
+# GitHub Actions use this path to cache datasets.
+# Use `datadir` fixture where possible and use `DATASETS_PATH` in
+# `pytest.mark.parametrize()` where you cannot use `datadir`.
+# https://github.com/pytest-dev/pytest/issues/349
+from tests import DATASETS_PATH
+
+
+@pytest.fixture(scope="session")
+def datadir():
+    return Path(DATASETS_PATH)
diff --git a/tests/datamodules/test_dataloader.py b/tests/datamodules/test_dataloader.py
@@ -5,8 +5,8 @@
 from pl_bolts.datasets.cifar10_dataset import CIFAR10
 
 
-def test_async_dataloader(tmpdir):
-    ds = CIFAR10(tmpdir)
+def test_async_dataloader(datadir):
+    ds = CIFAR10(data_dir=datadir)
 
     if torch.cuda.device_count() > 0:  # Can only run this test with a GPU
         device = torch.device('cuda', 0)

diff --git a/tests/datamodules/test_datamodules.py b/tests/datamodules/test_datamodules.py
@@ -6,8 +6,9 @@
 from pl_bolts.datamodules import CityscapesDataModule
 
 
-def test_dev_datasets(tmpdir):
-    ds = CIFAR10(tmpdir)
+def test_dev_datasets(datadir):
+
+    ds = CIFAR10(data_dir=datadir)
     for b in ds:
         pass
 
@@ -35,14 +36,14 @@ def _create_synth_Cityscapes_dataset(path_dir):
                 fine_labels_dir / split / city / semantic_target_name)
 
 
-def test_cityscapes_datamodule(tmpdir):
+def test_cityscapes_datamodule(datadir):
 
-    _create_synth_Cityscapes_dataset(tmpdir)
+    _create_synth_Cityscapes_dataset(datadir)
 
     batch_size = 1
     target_types = ['semantic', 'instance']
     for target_type in target_types:
-        dm = CityscapesDataModule(tmpdir,
+        dm = CityscapesDataModule(datadir,
                                   num_workers=0,
                                   batch_size=batch_size,
                                   target_type=target_type)

diff --git a/tests/datamodules/test_sklearn_dataloaders.py b/tests/datamodules/test_sklearn_dataloaders.py
@@ -12,7 +12,7 @@
          ' install it with `pip install sklearn`.')
 
 
-def test_dataloader(tmpdir):
+def test_dataloader():
     seed_everything()
 
     X = np.random.rand(5, 2)

diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py
@@ -3,31 +3,31 @@
 from pl_bolts.datasets import DummyDataset, RandomDataset, RandomDictDataset, RandomDictStringDataset
 
 
-def test_dummy_ds(tmpdir):
+def test_dummy_ds():
     ds = DummyDataset((1, 2), num_samples=100)
     dl = DataLoader(ds)
 
     for b in dl:
         pass
 
 
-def test_rand_ds(tmpdir):
+def test_rand_ds():
     ds = RandomDataset(32, num_samples=100)
     dl = DataLoader(ds)
 
     for b in dl:
         pass
 
 
-def test_rand_dict_ds(tmpdir):
+def test_rand_dict_ds():
     ds = RandomDictDataset(32, num_samples=100)
     dl = DataLoader(ds)
 
     for b in dl:
         pass
 
 
-def test_rand_str_dict_ds(tmpdir):
+def test_rand_str_dict_ds():
     ds = RandomDictStringDataset(32, num_samples=100)
     dl = DataLoader(ds)
 

diff --git a/tests/models/self_supervised/test_models.py b/tests/models/self_supervised/test_models.py
@@ -15,14 +15,14 @@
 
 # TODO: this test is hanging (runs for more than 10 min) so we need to use GPU or optimize it...
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-def test_cpcv2(tmpdir):
+def test_cpcv2(tmpdir, datadir):
     seed_everything()
 
-    datamodule = CIFAR10DataModule(data_dir=tmpdir, num_workers=0, batch_size=2)
+    datamodule = CIFAR10DataModule(data_dir=datadir, num_workers=0, batch_size=2)
     datamodule.train_transforms = CPCTrainTransformsCIFAR10()
     datamodule.val_transforms = CPCEvalTransformsCIFAR10()
 
-    model = CPCV2(encoder='resnet18', data_dir=tmpdir, batch_size=2, online_ft=True, datamodule=datamodule)
+    model = CPCV2(encoder='resnet18', data_dir=datadir, batch_size=2, online_ft=True, datamodule=datamodule)
     trainer = pl.Trainer(fast_dev_run=True, max_epochs=1, default_root_dir=tmpdir)
     trainer.fit(model)
     loss = trainer.progress_bar_dict['val_nce']
@@ -32,51 +32,51 @@ def test_cpcv2(tmpdir):
 
 # TODO: this test is hanging (runs for more than 10 min) so we need to use GPU or optimize it...
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-def test_byol(tmpdir):
+def test_byol(tmpdir, datadir):
     seed_everything()
 
-    datamodule = CIFAR10DataModule(data_dir=tmpdir, num_workers=0, batch_size=2)
+    datamodule = CIFAR10DataModule(data_dir=datadir, num_workers=0, batch_size=2)
     datamodule.train_transforms = CPCTrainTransformsCIFAR10()
     datamodule.val_transforms = CPCEvalTransformsCIFAR10()
 
-    model = BYOL(data_dir=tmpdir, num_classes=datamodule)
+    model = BYOL(data_dir=datadir, num_classes=datamodule)
     trainer = pl.Trainer(fast_dev_run=True, max_epochs=1, default_root_dir=tmpdir, max_steps=2)
     trainer.fit(model, datamodule)
     loss = trainer.progress_bar_dict['loss']
 
     assert float(loss) < 1.0
 
 
-def test_amdim(tmpdir):
+def test_amdim(tmpdir, datadir):
     seed_everything()
 
-    model = AMDIM(data_dir=tmpdir, batch_size=2, online_ft=True, encoder='resnet18')
+    model = AMDIM(data_dir=datadir, batch_size=2, online_ft=True, encoder='resnet18')
     trainer = pl.Trainer(fast_dev_run=True, max_epochs=1, default_root_dir=tmpdir)
     trainer.fit(model)
     loss = trainer.progress_bar_dict['loss']
 
     assert float(loss) > 0
 
 
-def test_moco(tmpdir):
+def test_moco(tmpdir, datadir):
     seed_everything()
 
-    datamodule = CIFAR10DataModule(tmpdir, num_workers=0, batch_size=2)
+    datamodule = CIFAR10DataModule(data_dir=datadir, num_workers=0, batch_size=2)
     datamodule.train_transforms = Moco2TrainCIFAR10Transforms()
     datamodule.val_transforms = Moco2EvalCIFAR10Transforms()
 
-    model = MocoV2(data_dir=tmpdir, batch_size=2, online_ft=True)
+    model = MocoV2(data_dir=datadir, batch_size=2, online_ft=True)
     trainer = pl.Trainer(fast_dev_run=True, max_epochs=1, default_root_dir=tmpdir, callbacks=[MocoLRScheduler()])
     trainer.fit(model, datamodule=datamodule)
     loss = trainer.progress_bar_dict['loss']
 
     assert float(loss) > 0
 
 
-def test_simclr(tmpdir):
+def test_simclr(tmpdir, datadir):
     seed_everything()
 
-    datamodule = CIFAR10DataModule(tmpdir, num_workers=0, batch_size=2)
+    datamodule = CIFAR10DataModule(data_dir=datadir, num_workers=0, batch_size=2)
     datamodule.train_transforms = SimCLRTrainDataTransform(32)
     datamodule.val_transforms = SimCLREvalDataTransform(32)
 
@@ -88,14 +88,14 @@ def test_simclr(tmpdir):
     assert float(loss) > 0
 
 
-def test_swav(tmpdir):
+def test_swav(tmpdir, datadir):
     seed_everything()
 
     batch_size = 2
 
     # inputs, y = batch  (doesn't receive y for some reason)
     datamodule = CIFAR10DataModule(
-        data_dir=tmpdir,
+        data_dir=datadir,
         batch_size=batch_size,
         num_workers=0
     )

diff --git a/tests/models/self_supervised/test_resnets.py b/tests/models/self_supervised/test_resnets.py
@@ -16,7 +16,7 @@
 )
 
 
-def test_cpc_resnet(tmpdir):
+def test_cpc_resnet():
     x = torch.rand(3, 3, 64, 64)
     model = cpc_resnet50(x)
     model(x)
@@ -33,7 +33,7 @@ def test_cpc_resnet(tmpdir):
     wide_resnet50_2,
     wide_resnet101_2
 ])
-def test_torchvision_resnets(tmpdir, model_class):
+def test_torchvision_resnets(model_class):
     x = torch.rand(3, 3, 64, 64)
     model = model_class()
     model(x)
@@ -44,7 +44,7 @@ def test_torchvision_resnets(tmpdir, model_class):
     64,
     128
 ])
-def test_amdim_encoder(tmpdir, size):
+def test_amdim_encoder(size):
     dummy_batch = torch.zeros((2, 3, size, size))
     model = AMDIMEncoder(dummy_batch, encoder_size=size)
     model.init_weights()

diff --git a/tests/models/self_supervised/test_scripts.py b/tests/models/self_supervised/test_scripts.py
@@ -3,8 +3,12 @@
 import pytest
 import torch
 
+from tests import DATASETS_PATH
 
-@pytest.mark.parametrize('cli_args', ["--max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2"])
+
+@pytest.mark.parametrize('cli_args', [
+    f"--data_dir {DATASETS_PATH} --max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2"
+])
 def test_cli_run_self_supervised_amdim(cli_args):
     """Test running CLI for an example with default params."""
     from pl_bolts.models.self_supervised.amdim.amdim_module import cli_main
@@ -16,7 +20,9 @@ def test_cli_run_self_supervised_amdim(cli_args):
 
 # TODO: this test is hanging (runs for more than 10 min) so we need to use GPU or optimize it...
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
-@pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2 --encoder resnet18'])
+@pytest.mark.parametrize('cli_args', [
+    f'--data_dir {DATASETS_PATH} --max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2 --encoder resnet18'
+])
 def test_cli_run_self_supervised_cpc(cli_args):
     """Test running CLI for an example with default params."""
     from pl_bolts.models.self_supervised.cpc.cpc_module import cli_main
@@ -26,7 +32,9 @@ def test_cli_run_self_supervised_cpc(cli_args):
         cli_main()
 
 
-@pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2'])
+@pytest.mark.parametrize('cli_args', [
+    f'--data_dir {DATASETS_PATH} --max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2'
+])
 def test_cli_run_self_supervised_moco(cli_args):
     """Test running CLI for an example with default params."""
     from pl_bolts.models.self_supervised.moco.moco2_module import cli_main
@@ -36,7 +44,9 @@ def test_cli_run_self_supervised_moco(cli_args):
         cli_main()
 
 
-@pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2 --online_ft'])
+@pytest.mark.parametrize('cli_args', [
+    f'--data_dir {DATASETS_PATH} --max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2 --online_ft'
+])
 def test_cli_run_self_supervised_simclr(cli_args):
     """Test running CLI for an example with default params."""
     from pl_bolts.models.self_supervised.simclr.simclr_module import cli_main
@@ -46,7 +56,9 @@ def test_cli_run_self_supervised_simclr(cli_args):
         cli_main()
 
 
-@pytest.mark.parametrize('cli_args', ['--max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2 --online_ft'])
+@pytest.mark.parametrize('cli_args', [
+    f'--data_dir {DATASETS_PATH} --max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2 --online_ft'
+])
 def test_cli_run_self_supervised_byol(cli_args):
     """Test running CLI for an example with default params."""
     from pl_bolts.models.self_supervised.byol.byol_module import cli_main
@@ -58,8 +70,8 @@ def test_cli_run_self_supervised_byol(cli_args):
 
 @pytest.mark.parametrize(
     'cli_args', [
-        '--max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2 --gpus 0 --arch resnet18'
-        ' --hidden_mlp 512 --fp32 --sinkhorn_iterations 1 --nmb_prototypes 2 --dataset cifar10'
+        f'--dataset cifar10 --data_path {DATASETS_PATH} --max_epochs 1 --max_steps 3 --fast_dev_run --batch_size 2'
+        ' --gpus 0 --arch resnet18 --hidden_mlp 512 --fp32 --sinkhorn_iterations 1 --nmb_prototypes 2'
     ]
 )
 def test_cli_run_self_supervised_swav(cli_args):