chore(deps): bump torch from 1.11.0 to 2.3.0 #9726

Merged: 11 commits, Aug 8, 2024
Changes from all commits
4 changes: 2 additions & 2 deletions docs/requirements.txt
@@ -30,6 +30,6 @@ boto3>=1.24.4,<2.0
# build requirements
tensorflow==2.12.0; sys_platform != 'darwin' or platform_machine != 'arm64'
tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64'
-torch==1.11.0
-torchvision==0.12.0
+torch==2.3.0
+torchvision==0.18.0
numpy<2
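torch and torchvision are released in lockstep, and the torch 2.3.x series pairs with torchvision 0.18.x, which is why both pins move together here. A minimal, illustrative sanity check of the installed pair (the assertion itself is an assumption, not part of this PR):

```python
import torch
import torchvision

# Illustrative only: confirm the environment matches the pins above.
assert torch.__version__.startswith("2.3"), torch.__version__
assert torchvision.__version__.startswith("0.18"), torchvision.__version__
```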
2 changes: 1 addition & 1 deletion e2e_tests/Makefile
@@ -15,7 +15,7 @@ check:
-s ./tests/fixtures/pytorch_amp/data.py
black . --check
python -m flake8
-mypy --exclude fixtures tests
+mypy tests

test:
pytest -vv -s --durations=0 tests
6 changes: 6 additions & 0 deletions e2e_tests/mypy.ini
@@ -17,9 +17,15 @@ warn_redundant_casts = True
warn_return_any = True
warn_unused_configs = True
warn_unused_ignores = True
+exclude = fixtures

[mypy-azure.storage.blob.*]
; starting in azure 12.9.0:
; site-packages/azure/storage/blob/_serialize.py:129: error: Type signature has too many arguments
; site-packages/azure/storage/blob/_blob_client.py:1406: error: Type signature has too few arguments
follow_imports = skip

+[mypy-torch.*]
+; starting in torch 2.2.0
+; /tmp/venv/lib/python3.8/site-packages/torch/distributed/optim/apply_optimizer_in_backward.py: error: INTERNAL ERROR
+follow_imports = skip
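For context, `follow_imports = skip` tells mypy not to analyze the matched package at all, so names imported from it are treated as `Any` at call sites; this avoids the internal error mypy hits inside torch 2.2+ while our own code is still checked. A minimal sketch of the effect (the helper below is hypothetical, not part of this diff):

```python
import torch

def load_cpu_state(path: str) -> dict:
    # With [mypy-torch.*] follow_imports = skip, torch.load is effectively Any here,
    # so we narrow the result explicitly to satisfy the declared return type.
    state = torch.load(path, map_location="cpu")
    assert isinstance(state, dict)
    return state
```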
4 changes: 2 additions & 2 deletions e2e_tests/tests/requirements.txt
@@ -2,8 +2,8 @@ appdirs
# pytest 6.0 has linter-breaking changes
pytest>=6.0.1
pytest-timeout
-torch==1.11.0
-torchvision==0.12.0
+torch==2.3.0
+torchvision==0.18.0
tensorflow==2.12.0; sys_platform != 'darwin' or platform_machine != 'arm64'
tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64'
pandas
@@ -312,7 +312,10 @@ def main(det_callback, tb_callback, model_args, data_args, training_args):
id2label[str(i)] = label

# Load the accuracy metric from the datasets package
-metric = datasets.load_metric("accuracy", trust_remote_code=True,)
+metric = datasets.load_metric(
+    "accuracy",
+    trust_remote_code=True,
+)

# Define our compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
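The comment above spells out the `compute_metrics` contract; a minimal sketch of such a function, assuming `metric` is the accuracy metric loaded just before it (illustrative, not part of this diff):

```python
import numpy as np
from transformers import EvalPrediction

def compute_metrics(p: EvalPrediction) -> dict:
    # Convert logits to predicted class ids, then let the datasets metric
    # return a {"accuracy": float} dictionary.
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)
```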
2 changes: 1 addition & 1 deletion harness/determined/pytorch/_experimental.py
@@ -32,7 +32,7 @@ def use_amp(self) -> None:
PyTorch 1.6 or greater is required for this feature.
"""
if HAVE_AMP:
-self._parent.wrap_scaler(amp.GradScaler()) # type: ignore
+self._parent.wrap_scaler(amp.GradScaler())
self._auto_amp = True

def disable_dataset_reproducibility_checks(self) -> None:
4 changes: 1 addition & 3 deletions harness/determined/pytorch/_load.py
@@ -205,9 +205,7 @@ def load_trial_from_checkpoint_path(

trial = trial_class(trial_context, **trial_kwargs) # type: ignore

-checkpoint = torch.load( # type: ignore
-    str(ckpt_dir.joinpath("state_dict.pth")), **torch_load_kwargs
-)
+checkpoint = torch.load(str(ckpt_dir.joinpath("state_dict.pth")), **torch_load_kwargs)

# We are still backwards compatible with checkpoints saved in the pre-0.12.13 PyTorchTrial API,
# but when we can guarantee that the pre-0.12.13 API was not in use, we avoid checking for a
2 changes: 1 addition & 1 deletion harness/determined/pytorch/_pytorch_context.py
@@ -897,7 +897,7 @@ def step_fn() -> None:
scaler.step(optimizer) # type: ignore

else:
-step_fn = optimizer.step # type: ignore
+step_fn = optimizer.step

# In the case of PyTorch DDP, losses are synchronized automatically on the backwards() pass
if self.distributed.size > 1 and self._distributed_backend.use_horovod():
8 changes: 4 additions & 4 deletions harness/determined/pytorch/_pytorch_trial.py
@@ -292,9 +292,9 @@ def pre_execute_hook(
hvd.init()
if distributed_backend.use_torch():
if torch.cuda.is_available():
dist.init_process_group(backend="nccl") # type: ignore
dist.init_process_group(backend="nccl")
else:
dist.init_process_group(backend="gloo") # type: ignore
dist.init_process_group(backend="gloo")

cls._set_random_seeds(trial_seed)

@@ -912,7 +912,7 @@ def _train_batch(

return training_metrics

-@torch.no_grad() # type: ignore
+@torch.no_grad()
def _validate(self, searcher_op: Optional[core.SearcherOperation] = None) -> Dict[str, Any]:
# Report a validation step is starting.
if self.is_chief:
@@ -1103,7 +1103,7 @@ def _load(self, load_path: pathlib.Path) -> None:
for ckpt_path in potential_paths:
maybe_ckpt = load_path.joinpath(*ckpt_path)
if maybe_ckpt.exists():
-checkpoint = torch.load(str(maybe_ckpt), map_location="cpu") # type: ignore
+checkpoint = torch.load(str(maybe_ckpt), map_location="cpu")
break

if checkpoint is None or not isinstance(checkpoint, dict):
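`torch.no_grad` has long worked both as a context manager and as a decorator, and current torch releases ship type annotations for it, which is presumably why the `# type: ignore` markers on the decorated methods above can be dropped. A small illustrative sketch (the function is hypothetical):

```python
import torch

@torch.no_grad()  # gradients are not tracked inside the decorated function
def validate_step(model: torch.nn.Module, batch: torch.Tensor) -> torch.Tensor:
    return model(batch)
```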
4 changes: 2 additions & 2 deletions harness/determined/pytorch/_trainer.py
@@ -214,9 +214,9 @@ def _initialize_distributed_backend() -> Optional[core.DistributedContext]:
return core.DistributedContext.from_horovod(horovod.hvd)
elif distributed_backend.use_torch():
if torch.cuda.is_available():
dist.init_process_group(backend="nccl") # type: ignore
dist.init_process_group(backend="nccl")
else:
dist.init_process_group(backend="gloo") # type: ignore
dist.init_process_group(backend="gloo")
return core.DistributedContext.from_torch_distributed()
elif info and (len(info.container_addrs) > 1 or len(info.slot_ids) > 1):
raise ValueError(
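For reference, `init_process_group` with the default `env://` init method reads its rendezvous configuration from environment variables (normally set by a launcher such as `torchrun`), and the code above picks `nccl` when CUDA is available and `gloo` otherwise. A minimal single-process sketch with illustrative values:

```python
import os
import torch
import torch.distributed as dist

# Illustrative only: a launcher such as torchrun normally sets these variables.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

backend = "nccl" if torch.cuda.is_available() else "gloo"
dist.init_process_group(backend=backend)
dist.destroy_process_group()
```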
4 changes: 2 additions & 2 deletions harness/determined/pytorch/deepspeed/_deepspeed_trial.py
@@ -498,7 +498,7 @@ def _train_for_step(

return metrics

-@torch.no_grad() # type: ignore
+@torch.no_grad()
def _compute_validation_metrics(self) -> workload.Response:
self.context.reset_reducers()
# Set the behavior of certain layers (e.g., dropout) that are
@@ -617,7 +617,7 @@ def _load(self, load_path: pathlib.Path) -> None:
if not maybe_ckpt.exists():
return

-checkpoint = torch.load(str(maybe_ckpt), map_location="cpu") # type: ignore
+checkpoint = torch.load(str(maybe_ckpt), map_location="cpu")
if not isinstance(checkpoint, dict):
raise det.errors.InvalidExperimentException(
f"Expected checkpoint at {maybe_ckpt} to be a dict "
@@ -164,9 +164,9 @@ def _initialize_distributed_backend() -> Optional[core.DistributedContext]:
distributed_backend = det._DistributedBackend()
if distributed_backend.use_torch():
if torch.cuda.is_available():
dist.init_process_group(backend="nccl") # type: ignore
dist.init_process_group(backend="nccl")
else:
dist.init_process_group(backend="gloo") # type: ignore
dist.init_process_group(backend="gloo")
return core.DistributedContext.from_torch_distributed()

info = det.get_cluster_info()
2 changes: 1 addition & 1 deletion harness/tests/experiment/pytorch/test_pytorch_context.py
@@ -52,6 +52,6 @@ def test_training_not_started(self) -> None:
self.context._should_communicate_and_update()

def test_wrap_scaler(self) -> None:
-scaler = torch.cuda.amp.GradScaler() # type: ignore # GradScaler.__init__ is untyped
+scaler = torch.cuda.amp.GradScaler()
assert scaler == self.context.wrap_scaler(scaler)
assert scaler == self.context._scaler
6 changes: 3 additions & 3 deletions harness/tests/requirements/requirements-harness.txt
@@ -7,10 +7,10 @@ requests_mock
coverage
deepspeed==0.8.3
# lightning not tested but required for linter checks
-lightning==1.9
+lightning
transformers>=4.8.2,<4.29.0
-torch==1.11.0
-torchvision==0.12.0
+torch==2.3.0
+torchvision==0.18.0
tensorflow==2.12.0; sys_platform != 'darwin' or platform_machine != 'arm64'
tensorflow-macos==2.11.0; sys_platform == 'darwin' and platform_machine == 'arm64'
attrdict3
2 changes: 1 addition & 1 deletion model_hub/model_hub/mmdetection/_trial.py
@@ -194,7 +194,7 @@ def train_batch(self, batch: Any, epoch_idx: int, batch_idx: int) -> Dict[str, t
def evaluate_batch(self, batch: Any, batch_idx: int) -> Dict[str, Any]:
batch = self.to_device(batch)
batch = {key: batch[key][0].data for key in batch}
-with torch.no_grad(): # type: ignore
+with torch.no_grad():
result = self.model(return_loss=False, rescale=True, **batch)
if isinstance(result[0], tuple):
result = [
8 changes: 4 additions & 4 deletions model_hub/model_hub/mmdetection/utils.py
@@ -64,7 +64,7 @@ def get_pretrained_ckpt_path(download_directory: str, config_file: str) -> Tuple
ckpt_path = model_hub.utils.download_url(
download_directory, CONFIG_TO_PRETRAINED[config_file]
)
-return ckpt_path, torch.load(ckpt_path) # type: ignore
+return ckpt_path, torch.load(ckpt_path)
return None, None


@@ -93,11 +93,11 @@ def build_fp16_loss_scaler(loss_scale: mmcv.Config) -> Any:
... )
"""
if loss_scale == "dynamic":
-loss_scaler = torch.cuda.amp.GradScaler() # type: ignore
+loss_scaler = torch.cuda.amp.GradScaler()
elif isinstance(loss_scale, float):
-loss_scaler = torch.cuda.amp.GradScaler(init_scale=loss_scale) # type: ignore
+loss_scaler = torch.cuda.amp.GradScaler(init_scale=loss_scale)
elif isinstance(loss_scale, dict):
-loss_scaler = torch.cuda.amp.GradScaler(**loss_scale) # type: ignore
+loss_scaler = torch.cuda.amp.GradScaler(**loss_scale)
else:
raise Exception(
"Cannot parse fp16 configuration. Expected cfg to be str(dynamic), float or dict."
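`torch.cuda.amp.GradScaler`, constructed above for the fp16 loss scaler, is typically driven as shown in this minimal AMP training-step sketch (`model`, `optimizer`, and `loader` are assumed, not part of this PR):

```python
import torch

scaler = torch.cuda.amp.GradScaler()
for inputs, targets in loader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.cross_entropy(model(inputs), targets)
    scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
    scaler.step(optimizer)         # unscale gradients; skip the step if inf/nan appear
    scaler.update()                # adjust the scale factor for the next iteration
```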