chore(deps): bump torch from 1.11.0 to 2.3.0 #9726

Merged: 11 commits, Aug 8, 2024
Changes from all commits
4 changes: 2 additions & 2 deletions docs/requirements.txt
@@ -30,6 +30,6 @@ boto3>=1.24.4,<2.0
# build requirements
tensorflow==2.12.0; sys_platform != 'darwin' or platform_machine != 'arm64'
tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64'
-torch==1.11.0
-torchvision==0.12.0
+torch==2.3.0
+torchvision==0.18.0
numpy<2
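torch and torchvision are released in lockstep, and the torch 2.3.x series pairs with torchvision 0.18.x, which is why both pins move together here. A minimal, illustrative sanity check of the installed pair (the assertion itself is an assumption, not part of this PR):

```python
import torch
import torchvision

# Illustrative only: confirm the environment matches the pins above.
assert torch.__version__.startswith("2.3"), torch.__version__
assert torchvision.__version__.startswith("0.18"), torchvision.__version__
```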
2 changes: 1 addition & 1 deletion e2e_tests/Makefile
@@ -15,7 +15,7 @@ check:
-s ./tests/fixtures/pytorch_amp/data.py
black . --check
python -m flake8
-mypy --exclude fixtures tests
+mypy tests

test:
pytest -vv -s --durations=0 tests
6 changes: 6 additions & 0 deletions e2e_tests/mypy.ini
@@ -17,9 +17,15 @@ warn_redundant_casts = True
warn_return_any = True
warn_unused_configs = True
warn_unused_ignores = True
+exclude = fixtures

[mypy-azure.storage.blob.*]
; starting in azure 12.9.0:
; site-packages/azure/storage/blob/_serialize.py:129: error: Type signature has too many arguments
; site-packages/azure/storage/blob/_blob_client.py:1406: error: Type signature has too few arguments
follow_imports = skip

+[mypy-torch.*]
+; starting in torch 2.2.0
+; /tmp/venv/lib/python3.8/site-packages/torch/distributed/optim/apply_optimizer_in_backward.py: error: INTERNAL ERROR
+follow_imports = skip
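For context, `follow_imports = skip` tells mypy not to analyze the matched package at all, so names imported from it are treated as `Any` at call sites; this avoids the internal error mypy hits inside torch 2.2+ while our own code is still checked. A minimal sketch of the effect (the helper below is hypothetical, not part of this diff):

```python
import torch

def load_cpu_state(path: str) -> dict:
    # With [mypy-torch.*] follow_imports = skip, torch.load is effectively Any here,
    # so we narrow the result explicitly to satisfy the declared return type.
    state = torch.load(path, map_location="cpu")
    assert isinstance(state, dict)
    return state
```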
4 changes: 2 additions & 2 deletions e2e_tests/tests/requirements.txt
@@ -2,8 +2,8 @@ appdirs
# pytest 6.0 has linter-breaking changes
pytest>=6.0.1
pytest-timeout
-torch==1.11.0
-torchvision==0.12.0
+torch==2.3.0
+torchvision==0.18.0
tensorflow==2.12.0; sys_platform != 'darwin' or platform_machine != 'arm64'
tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64'
pandas
@@ -312,7 +312,10 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
id2label[str(i)] = label

# Load the accuracy metric from the datasets package
-metric = datasets.load_metric("accuracy", trust_remote_code=True,)
+metric = datasets.load_metric(
+    "accuracy",
+    trust_remote_code=True,
+)

# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
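The comment above spells out the `compute_metrics` contract; a minimal sketch of such a function, assuming `metric` is the accuracy metric loaded just before it (illustrative, not part of this diff):

```python
import numpy as np
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction) -> dict:
    # Convert logits to predicted class ids, then let the datasets metric
    # return a {"accuracy": float} dictionary.
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)
```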
2 changes: 1 addition & 1 deletion harness/determined/pytorch/_experimental.py
@@ -32,7 +32,7 @@ def use_amp(self) -> None:
PyTorch 1.6 or greater is required for this feature.
"""
if HAVE_AMP:
-self._parent.wrap_scaler(amp.GradScaler()) # type: ignore
+self._parent.wrap_scaler(amp.GradScaler())
self._auto_amp = True

def disable_dataset_reproducibility_checks(self) -> None:
4 changes: 1 addition & 3 deletions harness/determined/pytorch/_load.py
@@ -205,9 +205,7 @@ def load_trial_from_checkpoint_path(

trial = trial_class(trial_context, **trial_kwargs) # type: ignore

-checkpoint = torch.load( # type: ignore
-    str(ckpt_dir.joinpath("state_dict.pth")), **torch_load_kwargs
-)
+checkpoint = torch.load(str(ckpt_dir.joinpath("state_dict.pth")), **torch_load_kwargs)

# We are still backwards compatible with checkpoints saved in the pre-0.12.13 PyTorchTrial API,
# but when we can guarantee that the pre-0.12.13 API was not in use, we avoid checking for a
2 changes: 1 addition & 1 deletion harness/determined/pytorch/_pytorch_context.py
@@ -897,7 +897,7 @@ def step_fn() -> None:
scaler.step(optimizer) # type: ignore

else:
-step_fn = optimizer.step # type: ignore
+step_fn = optimizer.step

# In the case of PyTorch DDP, losses are synchronized automatically on the backwards() pass
if self.distributed.size > 1 and self._distributed_backend.use_horovod():
8 changes: 4 additions & 4 deletions harness/determined/pytorch/_pytorch_trial.py
@@ -292,9 +292,9 @@ def pre_execute_hook(
hvd.init()
if distributed_backend.use_torch():
if torch.cuda.is_available():
dist.init_process_group(backend="nccl") # type: ignore
dist.init_process_group(backend="nccl")
else:
dist.init_process_group(backend="gloo") # type: ignore
dist.init_process_group(backend="gloo")

cls._set_random_seeds(trial_seed)

@@ -912,7 +912,7 @@ def _train_batch(

return training_metrics

-@torch.no_grad() # type: ignore
+@torch.no_grad()
def _validate(self, searcher_op: Optional[core.SearcherOperation] = None) -> Dict[str, Any]:
# Report a validation step is starting.
if self.is_chief:
@@ -1103,7 +1103,7 @@ def _load(self, load_path: pathlib.Path) -> None:
for ckpt_path in potential_paths:
maybe_ckpt = load_path.joinpath(*ckpt_path)
if maybe_ckpt.exists():
-checkpoint = torch.load(str(maybe_ckpt), map_location="cpu") # type: ignore
+checkpoint = torch.load(str(maybe_ckpt), map_location="cpu")
break

if checkpoint is None or not isinstance(checkpoint, dict):
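`torch.no_grad` has long worked both as a context manager and as a decorator, and current torch releases ship type annotations for it, which is presumably why the `# type: ignore` markers on the decorated methods above can be dropped. A small illustrative sketch (the function is hypothetical):

```python
import torch

@torch.no_grad()  # gradients are not tracked inside the decorated function
def validate_step(model: torch.nn.Module, batch: torch.Tensor) -> torch.Tensor:
    return model(batch)
```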
4 changes: 2 additions & 2 deletions harness/determined/pytorch/_trainer.py
@@ -214,9 +214,9 @@ def _initialize_distributed_backend() -> Optional[core.DistributedContext]:
return core.DistributedContext.from_horovod(horovod.hvd)
elif distributed_backend.use_torch():
if torch.cuda.is_available():
dist.init_process_group(backend="nccl") # type: ignore
dist.init_process_group(backend="nccl")
else:
dist.init_process_group(backend="gloo") # type: ignore
dist.init_process_group(backend="gloo")
return core.DistributedContext.from_torch_distributed()
elif info and (len(info.container_addrs) > 1 or len(info.slot_ids) > 1):
raise ValueError(
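For reference, `init_process_group` with the default `env://` init method reads its rendezvous configuration from environment variables (normally set by a launcher such as `torchrun`), and the code above picks `nccl` when CUDA is available and `gloo` otherwise. A minimal single-process sketch with illustrative values:

```python
import os
import torch
import torch.distributed as dist

# Illustrative only: a launcher such as torchrun normally sets these variables.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

backend = "nccl" if torch.cuda.is_available() else "gloo"
dist.init_process_group(backend=backend)
dist.destroy_process_group()
```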
4 changes: 2 additions & 2 deletions harness/determined/pytorch/deepspeed/_deepspeed_trial.py
@@ -498,7 +498,7 @@ def _train_for_step(

return metrics

-@torch.no_grad() # type: ignore
+@torch.no_grad()
def _compute_validation_metrics(self) -> workload.Response:
self.context.reset_reducers()
# Set the behavior of certain layers (e.g., dropout) that are
@@ -617,7 +617,7 @@ def _load(self, load_path: pathlib.Path) -> None:
if not maybe_ckpt.exists():
return

-checkpoint = torch.load(str(maybe_ckpt), map_location="cpu") # type: ignore
+checkpoint = torch.load(str(maybe_ckpt), map_location="cpu")
if not isinstance(checkpoint, dict):
raise det.errors.InvalidExperimentException(
f"Expected checkpoint at {maybe_ckpt} to be a dict "
@@ -164,9 +164,9 @@ def _initialize_distributed_backend() -> Optional[core.DistributedContext]:
distributed_backend = det._DistributedBackend()
if distributed_backend.use_torch():
if torch.cuda.is_available():
dist.init_process_group(backend="nccl") # type: ignore
dist.init_process_group(backend="nccl")
else:
dist.init_process_group(backend="gloo") # type: ignore
dist.init_process_group(backend="gloo")
return core.DistributedContext.from_torch_distributed()

info = det.get_cluster_info()
2 changes: 1 addition & 1 deletion harness/tests/experiment/pytorch/test_pytorch_context.py
@@ -52,6 +52,6 @@ def test_training_not_started(self) -> None:
self.context._should_communicate_and_update()

def test_wrap_scaler(self) -> None:
-scaler = torch.cuda.amp.GradScaler() # type: ignore # GradScaler.__init__ is untyped
+scaler = torch.cuda.amp.GradScaler()
assert scaler == self.context.wrap_scaler(scaler)
assert scaler == self.context._scaler
6 changes: 3 additions & 3 deletions harness/tests/requirements/requirements-harness.txt
@@ -7,10 +7,10 @@ requests_mock
coverage
deepspeed==0.8.3
# lightning not tested but required for linter checks
-lightning==1.9
+lightning
transformers>=4.8.2,<4.29.0
-torch==1.11.0
-torchvision==0.12.0
+torch==2.3.0
+torchvision==0.18.0
tensorflow==2.12.0; sys_platform != 'darwin' or platform_machine != 'arm64'
tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64'
attrdict3
2 changes: 1 addition & 1 deletion model_hub/model_hub/mmdetection/_trial.py
@@ -194,7 +194,7 @@ def train_batch(self, batch: Any, epoch_idx: int, batch_idx: int) -> Dict[str, t
def evaluate_batch(self, batch: Any, batch_idx: int) -> Dict[str, Any]:
batch = self.to_device(batch)
batch = {key: batch[key][0].data for key in batch}
-with torch.no_grad(): # type: ignore
+with torch.no_grad():
result = self.model(return_loss=False, rescale=True, **batch)
if isinstance(result[0], tuple):
result = [
8 changes: 4 additions & 4 deletions model_hub/model_hub/mmdetection/utils.py
@@ -64,7 +64,7 @@ def get_pretrained_ckpt_path(download_directory: str, config_file: str) -> Tuple
ckpt_path = model_hub.utils.download_url(
download_directory, CONFIG_TO_PRETRAINED[config_file]
)
-return ckpt_path, torch.load(ckpt_path) # type: ignore
+return ckpt_path, torch.load(ckpt_path)
return None, None


@@ -93,11 +93,11 @@ def build_fp16_loss_scaler(loss_scale: mmcv.Config) -> Any:
... )
"""
if loss_scale == "dynamic":
-loss_scaler = torch.cuda.amp.GradScaler() # type: ignore
+loss_scaler = torch.cuda.amp.GradScaler()
elif isinstance(loss_scale, float):
-loss_scaler = torch.cuda.amp.GradScaler(init_scale=loss_scale) # type: ignore
+loss_scaler = torch.cuda.amp.GradScaler(init_scale=loss_scale)
elif isinstance(loss_scale, dict):
-loss_scaler = torch.cuda.amp.GradScaler(**loss_scale) # type: ignore
+loss_scaler = torch.cuda.amp.GradScaler(**loss_scale)
else:
raise Exception(
"Cannot parse fp16 configuration. Expected cfg to be str(dynamic), float or dict."
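`torch.cuda.amp.GradScaler`, constructed above for the fp16 loss scaler, is typically driven as shown in this minimal AMP training-step sketch (`model`, `optimizer`, and `loader` are assumed, not part of this PR):

```python
import torch

scaler = torch.cuda.amp.GradScaler()
for inputs, targets in loader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.cross_entropy(model(inputs), targets)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)         # unscale gradients; skip the step if inf/nan appear
    scaler.update()                # adjust the scale factor for the next iteration
```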