Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix tests after huggingface_hub 0.24 #32054

Merged
merged 3 commits into from
Jul 19, 2024
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 35 additions & 23 deletions tests/trainer/test_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3739,47 +3739,59 @@ def get_commit_history(self, repo):

def test_push_to_hub_with_saves_each_epoch(self):
with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-epoch"),
push_to_hub=True,
hub_token=self._token,
# To avoid any flakiness if the training goes faster than the uploads.
hub_always_push=True,
save_strategy="epoch",
)
trainer.train()
with self.assertLogs(level="WARNING") as logs:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-epoch"),
push_to_hub=True,
hub_token=self._token,
# To avoid any flakiness if the training goes faster than the uploads.
hub_always_push=True,
save_strategy="epoch",
)
trainer.train()

commits = list_repo_commits(f"{USER}/test-trainer-epoch", token=self._token)
commits = [c.title for c in commits]
self.assertIn("initial commit", commits)
for i in range(1, 4):
self.assertIn(f"Training in progress, epoch {i}", commits)
self.assertIn("Training in progress, epoch 1", commits)
self.assertIn("Training in progress, epoch 2", commits)
# Epochs 3 and 4 are not guaranteed to be present (empty commits)
self.assertTrue(any("Skipping to prevent empty commit." in record.message for record in logs.records))

def test_push_to_hub_with_saves_each_n_steps(self):
num_gpus = max(1, backend_device_count(torch_device))
if num_gpus > 2:
self.skipTest(reason="More than 2 GPUs available")

with tempfile.TemporaryDirectory() as tmp_dir:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-step"),
push_to_hub=True,
hub_token=self._token,
# To avoid any flakiness if the training goes faster than the uploads.
hub_always_push=True,
save_strategy="steps",
save_steps=5,
)
trainer.train()
with self.assertLogs(level="WARNING") as logs:
trainer = get_regression_trainer(
output_dir=os.path.join(tmp_dir, "test-trainer-step"),
push_to_hub=True,
hub_token=self._token,
# To avoid any flakiness if the training goes faster than the uploads.
hub_always_push=True,
save_strategy="steps",
save_steps=5,
)
trainer.train()

commits = list_repo_commits(f"{USER}/test-trainer-step", token=self._token)
commits = [c.title for c in commits]
self.assertIn("initial commit", commits)

# Some commits are skipped if nothing has changed
# We expect 1 commit per 5 epochs + 1 commit at the end
nb_empty_commits = len(
[record for record in logs.records if "Skipping to prevent empty commit." in record.message]
)
nb_epoch_commits = len([commit for commit in commits if "Training in progress, step" in commit])

# max_steps depend on the number of available GPUs
max_steps = math.ceil(trainer.args.num_train_epochs * len(trainer.get_train_dataloader()))
for i in range(5, max_steps, 5):
self.assertIn(f"Training in progress, step {i}", commits)
nb_expected_commits = len(range(5, max_steps, 5))

self.assertGreaterEqual(nb_empty_commits + nb_epoch_commits, nb_expected_commits)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you leave a short comment why is it >=?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

addressed in ff09978


@require_tensorboard
def test_push_to_hub_with_tensorboard_logs(self):
Expand Down
Loading