From 60f0fdc9115d776022d09b4827343cdf79cb9d0f Mon Sep 17 00:00:00 2001
From: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
Date: Tue, 20 Feb 2024 18:19:31 -0800
Subject: [PATCH] Fix gradient clipping (#5150)

clip_grad_norm_ in deepspeed/runtime/utils.py clamps the clipping
coefficient max_norm / (total_norm + 1e-6) with torch.max instead of
torch.min, so gradients whose norm exceeds max_norm pass through
unclipped while gradients already within the limit are scaled up. This
PR switches the clamp to torch.min and adds a test case that compares
the clipped gradients against torch.nn.utils.clip_grad_norm_.

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/runtime/utils.py               |  2 +-
 tests/unit/runtime/test_runtime_utils.py | 25 ++++++++++++++++++++++--
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/deepspeed/runtime/utils.py b/deepspeed/runtime/utils.py
index d7a35b7dbbe9..d1ebe4b2f83d 100755
--- a/deepspeed/runtime/utils.py
+++ b/deepspeed/runtime/utils.py
@@ -407,7 +407,7 @@ def clip_grad_norm_(parameters, max_norm, norm_type=2, mpu=None):
     max_norm = torch.tensor([float(max_norm)], device=parameters[0].device)
     clip_coef = max_norm / (total_norm + 1e-6)
     tmp_tensor = torch.tensor([1.0], device=parameters[0].device)
-    clip_coef = torch.max(tmp_tensor, clip_coef)
+    clip_coef = torch.min(tmp_tensor, clip_coef)
     for p in parameters:
         p.grad.data.mul_(clip_coef)
     return total_norm
diff --git a/tests/unit/runtime/test_runtime_utils.py b/tests/unit/runtime/test_runtime_utils.py
index 5d8478b249be..6fdeb2074246 100644
--- a/tests/unit/runtime/test_runtime_utils.py
+++ b/tests/unit/runtime/test_runtime_utils.py
@@ -26,10 +26,10 @@ def test_call_to_str():
     assert c2s('hello', 1138, val=3) == 'hello(1138, val=3)'
 
 
-class TestClibGradNorm(DistributedTest):
+class TestClipGradNorm(DistributedTest):
     world_size = 2
 
-    def test(self):
+    def test_gather(self):
         param1 = torch.nn.Parameter(torch.Tensor([0]))
         param1.grad = torch.Tensor([1])
         param2 = torch.nn.Parameter(torch.Tensor([0]))
@@ -50,6 +50,27 @@ def test(self):
 
         assert gathered_norm[0] == gathered_norm[1], "norm at rank 0 does not match the norm at rank 1"
 
+    def test_clipped_val(self):
+        max_norm = 0.1
+
+        def test_params():
+            param1 = torch.nn.Parameter(torch.Tensor([0]))
+            param1.grad = torch.Tensor([1])
+            param2 = torch.nn.Parameter(torch.Tensor([0]))
+            param2.grad = torch.Tensor([1])
+            return [param1, param2]
+
+        # This assumes gradients are same on all the ranks and doesn't consider multiple ranks
+        params_expected = test_params()
+        torch.nn.utils.clip_grad_norm_(params_expected, max_norm)
+
+        params_actual = test_params()
+        ds_utils.clip_grad_norm_(params_actual, max_norm=max_norm)
+
+        # This can be allclose
+        assert torch.equal(params_expected[0].grad, params_actual[0].grad)
+        assert torch.equal(params_expected[1].grad, params_actual[1].grad)
+
 
 @pytest.mark.parametrize("check_using_norm", [(False), (True)])
 class TestCheckOverflow(DistributedTest):
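
A minimal standalone sketch of the behavior the one-line fix restores, assuming only PyTorch (clip_demo and its arguments are illustrative names, not DeepSpeed API, and the norm computation is simplified; the real clip_grad_norm_ also supports other norm types and model-parallel groups per its signature): the clipping coefficient max_norm / total_norm must be capped at 1.0 with torch.min, so gradients are only ever scaled down, never amplified.

    # Illustration only: contrast torch.min (fixed) with torch.max (buggy) clamping.
    import torch

    def clip_demo(grads, max_norm, use_min=True):
        # Global 2-norm over all gradients, mirroring the formula in the patched hunk.
        total_norm = torch.norm(torch.stack([g.norm(2) for g in grads]), 2)
        clip_coef = max_norm / (total_norm + 1e-6)
        one = torch.tensor(1.0)
        # torch.min caps the coefficient at 1.0; torch.max floors it at 1.0 (the bug).
        clip_coef = torch.min(one, clip_coef) if use_min else torch.max(one, clip_coef)
        return [g * clip_coef for g in grads]

    grads = [torch.tensor([3.0]), torch.tensor([4.0])]     # combined norm is 5.0
    print(clip_demo(grads, max_norm=1.0, use_min=True))    # scaled down to norm ~1.0 (clipped)
    print(clip_demo(grads, max_norm=1.0, use_min=False))   # unchanged, never clipped (old behavior)

With the correct torch.min clamp, a coefficient below 1.0 shrinks over-limit gradients onto the max_norm ball, and a coefficient above 1.0 is clamped back to 1.0 so in-limit gradients are left untouched, which is what the new test_clipped_val verifies against torch.nn.utils.clip_grad_norm_.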