part 5 of #3733 #3774

Merged · 3 commits · Oct 1, 2020
57 changes: 57 additions & 0 deletions tests/backends/test_ddp.py
@@ -0,0 +1,57 @@
import pytest
import torch
import os
from tests.backends import ddp_model
from tests.utilities.dist import call_training_script


@pytest.mark.parametrize('cli_args', [
pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
])
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_multi_gpu_model_ddp_fit_only(tmpdir, cli_args):
# call the script
std, err = call_training_script(ddp_model, cli_args, 'fit', tmpdir, timeout=120)

# load the results of the script
result_path = os.path.join(tmpdir, 'ddp.result')
result = torch.load(result_path)

# verify the file wrote the expected outputs
assert result['status'] == 'complete'


@pytest.mark.parametrize('cli_args', [
pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
])
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_multi_gpu_model_ddp_test_only(tmpdir, cli_args):
# call the script
call_training_script(ddp_model, cli_args, 'test', tmpdir)

# load the results of the script
result_path = os.path.join(tmpdir, 'ddp.result')
result = torch.load(result_path)

# verify the file wrote the expected outputs
assert result['status'] == 'complete'


# @pytest.mark.parametrize('cli_args', [
# pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
# ])
# @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
# def test_multi_gpu_model_ddp_fit_test(tmpdir, cli_args):
# # call the script
# call_training_script(ddp_model, cli_args, 'fit_test', tmpdir, timeout=20)
#
# # load the results of the script
# result_path = os.path.join(tmpdir, 'ddp.result')
# result = torch.load(result_path)
#
# # verify the file wrote the expected outputs
# assert result['status'] == 'complete'
#
# model_outs = result['result']
# for out in model_outs:
# assert out['test_acc'] > 0.90
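
The new tests rely on two helpers that are not shown in this diff: the `ddp_model` script module that gets launched, and `call_training_script` from `tests/utilities/dist.py`. A minimal sketch of what such a helper might look like follows — not the actual `tests/utilities/dist.py` implementation, just an assumption based on the subprocess logic removed from `tests/models/test_gpu.py` below; the `--trainer_method` and `--tmpdir` flag names are placeholders.

```python
# Hypothetical sketch of a call_training_script-style helper (not part of this diff).
# It launches the given test module in its own process, mirroring the subprocess
# logic removed from tests/models/test_gpu.py below, so DDP does not hang pytest.
import os
import subprocess
import sys
from pathlib import Path

import pytorch_lightning


def call_training_script(module, cli_args, method, tmpdir, timeout=60):
    file = Path(module.__file__).absolute()
    cli_args = cli_args.split(' ') if cli_args else []
    # placeholder flag names; the real ddp_model script may parse different arguments
    cli_args += ['--trainer_method', method, '--tmpdir', str(tmpdir)]
    command = [sys.executable, str(file)] + cli_args

    # prepend the repo root so the subprocess can import pytorch_lightning
    # even if it was not installed into the environment
    env = os.environ.copy()
    env['PYTHONPATH'] = f"{Path(pytorch_lightning.__file__).parent.parent}:" + env.get('PYTHONPATH', '')

    # DDP needs to run in its own process, or pytest will get stuck
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
    std, err = p.communicate(timeout=timeout)
    return std.decode('utf-8').strip(), err.decode('utf-8').strip()
```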
44 changes: 0 additions & 44 deletions tests/models/data/ddp/train_test_variations.py

This file was deleted.
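
The deleted `train_test_variations.py` previously supplied the model variations iterated over by the removed subprocess test in `test_gpu.py`; in the new layout, the launched `ddp_model` script presumably ends its run by writing the `ddp.result` file that both tests above load. A hypothetical sketch of that handshake, with the dict keys taken from the assertions in `tests/backends/test_ddp.py`:

```python
# Hypothetical sketch of how the launched script could write the ddp.result file
# that tests/backends/test_ddp.py loads back with torch.load (not part of this diff).
import os

import torch


def save_result(tmpdir, trainer_result):
    # 'status' is the key asserted by the new tests; 'result' matches the key the
    # commented-out fit_test case expects to contain per-model test outputs.
    # In a real DDP run, only the rank-0 process would typically write this file.
    result = {'status': 'complete', 'result': trainer_result}
    torch.save(result, os.path.join(tmpdir, 'ddp.result'))
```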

57 changes: 0 additions & 57 deletions tests/models/test_gpu.py
@@ -1,25 +1,18 @@
import os
import subprocess
import sys
from collections import namedtuple
from pathlib import Path
from unittest.mock import patch

import pytest
import torch
from torchtext.data import Batch, Dataset, Example, Field, LabelField

import pytorch_lightning
import tests.base.develop_pipelines as tpipes
import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.core import memory
from pytorch_lightning.utilities import device_parser
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests.base import EvalModelTemplate
from tests.models.data.ddp import train_test_variations
from pytorch_lightning.accelerators.gpu_backend import GPUBackend
from pytorch_lightning.accelerators.cpu_backend import CPUBackend


PRETEND_N_OF_GPUS = 16
@@ -84,34 +77,6 @@ def test_multi_gpu_model_dp(tmpdir):
memory.get_memory_profile('min_max')


@pytest.mark.parametrize('cli_args', [
pytest.param('--max_epochs 1 --gpus 2 --distributed_backend ddp'),
])
@pytest.mark.parametrize('variation', train_test_variations.get_variations())
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_multi_gpu_model_ddp(tmpdir, cli_args, variation):
""" Runs a basic training and test run with distributed_backend=ddp. """
file = Path(train_test_variations.__file__).absolute()
cli_args = cli_args.split(' ') if cli_args else []
cli_args += ['--default_root_dir', str(tmpdir)]
cli_args += ['--variation', variation]
command = [sys.executable, str(file)] + cli_args

# need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment
env = os.environ.copy()
env['PYTHONPATH'] = f'{pytorch_lightning.__file__}:' + env.get('PYTHONPATH', '')

# for running in ddp mode, we need to launch it in its own process or pytest will get stuck
p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)

std, err = p.communicate(timeout=60)
std = std.decode('utf-8').strip()
err = err.decode('utf-8').strip()
assert std, f"{variation} produced no output"
if p.returncode > 0:
pytest.fail(err)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
@pytest.mark.parametrize('gpus', [1, [0], [1]])
def test_single_gpu_model(tmpdir, gpus):
@@ -129,28 +94,6 @@ def test_single_gpu_model(tmpdir, gpus):
tpipes.run_model_test(trainer_options, model)


@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_ddp_all_dataloaders_passed_to_fit(tmpdir):
"""Make sure DDP works with dataloaders passed to fit()"""
tutils.set_random_master_port()

model = EvalModelTemplate()
fit_options = dict(train_dataloader=model.train_dataloader(),
val_dataloaders=model.val_dataloader())

trainer = Trainer(
default_root_dir=tmpdir,
progress_bar_refresh_rate=0,
max_epochs=1,
limit_train_batches=0.2,
limit_val_batches=0.2,
gpus=[0, 1],
distributed_backend='ddp_spawn'
)
result = trainer.fit(model, **fit_options)
assert result == 1, "DDP doesn't work with dataloaders passed to fit()."


@pytest.fixture
def mocked_device_count(monkeypatch):
def device_count():