Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle PermissionError in os.chmod Call - Update engine.py #4139

Merged
merged 10 commits into from
Aug 14, 2023
13 changes: 11 additions & 2 deletions deepspeed/runtime/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -3299,8 +3299,17 @@ def _copy_recovery_script(self, save_path):
dst = os.path.join(save_path, script)
#logger.info(f"creating recovery script {dst}")
copyfile(src, dst)
# make executable
os.chmod(dst, os.stat(dst).st_mode | stat.S_IEXEC)
self._change_recovery_script_permissions(dst)

def _change_recovery_script_permissions(self, dst):
# make executable (safeguard for file shares - Azure as example)
try:
os.chmod(dst, os.stat(dst).st_mode | stat.S_IEXEC)
except PermissionError as e:
#this message is used in unit test TestZeRONonDistributed
logger.info(
f'Warning: Could not change permissions for {dst} due to error: {e}. Continuing without changing permissions.'
)

def _save_zero_checkpoint(self, save_path, tag):
zero_checkpoint_name = self._get_zero_ckpt_name(save_path, tag)
Expand Down
41 changes: 41 additions & 0 deletions tests/unit/checkpoint/test_zero_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# DeepSpeed Team

import deepspeed
from types import SimpleNamespace
from deepspeed.ops.op_builder import CPUAdamBuilder
from deepspeed.checkpoint.utils import clone_tensors_for_torch_save, get_model_ckpt_name_for_rank
from deepspeed.accelerator import get_accelerator
Expand Down Expand Up @@ -576,3 +577,43 @@ def test_save_tensor_clone(self, tmpdir, zero_stage, use_cpu_device):
torch.save(clone_state_dict, clone_ckpt_file)

compare_state_dicts(torch.load(ref_ckpt_file), torch.load(clone_ckpt_file))


class TestZeRONonDistributed(DistributedTest):
world_size = 1
init_distributed = False

@pytest.mark.parametrize('zero_stage', [1, 2, 3])
def test_chmod_exception_handling(self, monkeypatch, zero_stage):

config_dict = {
"optimizer": {
"type": "AdamW"
},
"train_batch_size": 1,
"zero_optimization": {
"stage": zero_stage
}
}
args = SimpleNamespace(local_rank=0)
net = SimpleModel(hidden_dim=4)
engine, _, _, _ = deepspeed.initialize(args=args,
config=config_dict,
model=net,
model_parameters=net.parameters())

log_called = False

def mock_logger_info(message, *args, **kwargs):
nonlocal log_called
log_called = True

monkeypatch.setattr("deepspeed.utils.logger.info", mock_logger_info)
"""
This is presented for use-cases like Azure Storage File Share (where permissions are not allowed)
We use a fake file for this test (file not existing would present a similar issue as not being able to chmod)
"""
fake_recovery_script_dst = os.path.join("tmp", "zero_to_fp32.py")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess for the test we'd need to add FileNotFoundError in order for this test to pass?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup caught it and pushed already - you were faster than my comment 😄

engine._change_recovery_script_permissions(fake_recovery_script_dst)

assert log_called, "Expected deepspeed.utils.logger.info to be called."