chore: remove e2e_slurm_gpu series tests #10021
```diff
@@ -47,7 +47,6 @@ def wait_for_gc_to_finish(sess: api.Session, experiment_ids: List[int]) -> None:


 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 def test_set_gc_policy() -> None:
     sess = api_utils.user_session()
     save_exp_best = 3
```

> **Comment:** wut. Why was this ever a gpu test. Seriously. Instead of demoting this test to `e2e_slurm`, I moved the `e2e_slurm` marker to `test_delete_checkpoints`, which actually tests that checkpoint GC works. (I almost deleted …)
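For readers outside this CI setup: the `e2e_*` markers are plain pytest marks, and each hardware-specific CI job selects its suite by filtering on one mark, so removing `e2e_slurm_gpu` from a test only drops it from the Slurm-GPU job's selection. A minimal sketch of the mechanism (the marker placement below is illustrative, not this repo's CI configuration):

```python
import pytest


# Hypothetical markers mirroring the ones in this diff; a real project would
# register them in pytest.ini or pyproject.toml to silence "unknown mark" warnings.
@pytest.mark.e2e_gpu
def test_set_gc_policy() -> None:
    """Selected when the GPU job runs `pytest -m e2e_gpu`."""


@pytest.mark.e2e_slurm
def test_delete_checkpoints() -> None:
    """Selected when the Slurm job runs `pytest -m e2e_slurm`."""
```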
```diff
@@ -121,6 +120,7 @@ def test_gc_checkpoints_lfs() -> None:


 @pytest.mark.e2e_cpu
+@pytest.mark.e2e_slurm
 def test_delete_checkpoints() -> None:
     sess = api_utils.user_session()
     config = {
```

> **Comment:** I'm pretty open to not even testing checkpoint gc on slurm. AFAICT the only way it actually fails is if bind mounts break, and we probably test that sufficiently elsewhere. But I added the marker here because we don't have a good way to expose GC failures in general, so having one explicit test seems like the right choice.
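For context on what checkpoint GC keys off: it is driven by the experiment config's checkpoint storage policy. A sketch of the relevant fields follows; the values and the `host_path` are illustrative assumptions, not the config that `test_delete_checkpoints` actually builds.

```python
# Illustrative only: the save_* fields below are what checkpoint GC enforces
# once an experiment finishes.
config = {
    "checkpoint_storage": {
        "type": "shared_fs",
        "host_path": "/tmp/checkpoints",  # assumed path for this sketch
        "save_experiment_best": 3,  # keep the 3 best checkpoints across the experiment
        "save_trial_best": 1,       # keep each trial's best checkpoint
        "save_trial_latest": 1,     # keep each trial's most recent checkpoint
    },
}
```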
```diff
@@ -12,7 +12,6 @@


 @pytest.mark.e2e_gpu
-@pytest.mark.e2e_slurm_gpu
 @pytest.mark.timeout(30 * 60)
 def test_streaming_observability_metrics_apis() -> None:
     sess = api_utils.user_session()
```

> **Comment:** This is testing a REST API, which does not care about the resource manager. The only thing in the Python code that could be tested is "can you talk to a GPU", but I think there is basically zero chance that this test fails if GPU training succeeds.
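To illustrate the point about the REST API being resource-manager-agnostic: the observability endpoints are ordinary HTTP calls against the master. A rough sketch using plain `requests` follows; the real test goes through `api_utils.user_session()`, the master address is assumed, and the endpoint shown is just the master info endpoint rather than the metrics APIs the test exercises.

```python
import requests

# Assumed master address for the sketch; the tests derive this from their own config.
MASTER_URL = "http://localhost:8080"

# The same HTTP call works whether the cluster runs agents, Kubernetes, or Slurm
# underneath, which is the point the comment above is making.
resp = requests.get(f"{MASTER_URL}/api/v1/master", timeout=30)
resp.raise_for_status()
print(resp.json().get("version"))
```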
This file was deleted.
This file was deleted.
This file was deleted.
```diff
@@ -7,14 +7,13 @@

 @pytest.mark.distributed
 @pytest.mark.gpu_required
-@pytest.mark.e2e_slurm_gpu
 def test_pytorch2_hf_language_modeling_distributed() -> None:
     sess = api_utils.user_session()
     test_dir = "hf_language_modeling"

     config = conf.load_config(conf.hf_trainer_examples_path(f"{test_dir}/distributed.yaml"))
     config = conf.set_pt2_image(config)
-    config = conf.set_slots_per_trial(config, 4)
+    config = conf.set_slots_per_trial(config, 8)

     # Our hardware GPUs have only 16gb memory, lower memory use with smaller batches.
     config = conf.set_entrypoint(
```

> **Comment (on removing `e2e_slurm_gpu`):** I can get behind testing our pytorch2 images (so I did not delete the whole test), but I see absolutely no reason why this needs to run on slurm.

> **Comment (on the slots change):** Bring this gpu test in line with the slots per trial common to our other distributed gpu tests.
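For readers unfamiliar with the `conf` helpers used in these tests: `set_slots_per_trial` presumably just rewrites the `resources` section of the loaded experiment config. A rough equivalent is sketched below; this is assumed behavior, not the repo's actual implementation.

```python
from typing import Any, Dict


def set_slots_per_trial(config: Dict[str, Any], slots: int) -> Dict[str, Any]:
    # Assumed behavior: request `slots` accelerators per trial via the
    # experiment config's resources.slots_per_trial field.
    config.setdefault("resources", {})["slots_per_trial"] = slots
    return config
```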
This file was deleted.
This file was deleted.
New file (+79 lines):

```python
from typing import Any, Dict, Tuple

import torch.utils.data

from determined import pytorch


class MetricsCallback(pytorch.PyTorchCallback):
    def __init__(self):
        self.validation_metrics = []

    def on_validation_end(self, metrics: Dict[str, Any]) -> None:
        self.validation_metrics.append(metrics)


class IdentityDataset(torch.utils.data.Dataset):
    def __init__(self, initial_value: int = 1):
        self.initial_value = initial_value

    def __len__(self) -> int:
        return 64

    def __getitem__(self, index: int) -> Tuple:
        v = float(self.initial_value + 0.1 * index)
        return torch.Tensor([v]), torch.Tensor([v])


class IdentityPyTorchTrial(pytorch.PyTorchTrial):
    def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
        self.context = context

        model = torch.nn.Linear(1, 1, False)
        model.weight.data.fill_(0)
        self.model = context.wrap_model(model)

        self.lr = 0.001

        optimizer = torch.optim.SGD(self.model.parameters(), self.lr)
        self.opt = context.wrap_optimizer(optimizer)

        self.loss_fn = torch.nn.MSELoss(reduction="mean")
        self.metrics_callback = MetricsCallback()

    def train_batch(
        self, batch: pytorch.TorchData, epoch_idx: int, batch_idx: int
    ) -> Dict[str, torch.Tensor]:
        data, label = batch

        loss = self.loss_fn(self.model(data), label)

        self.context.backward(loss)

        self.context.step_optimizer(self.opt)

        return {
            "loss": loss,
        }

    def evaluate_batch(self, batch: pytorch.TorchData) -> Dict[str, Any]:
        data, label = batch

        loss = self.loss_fn(self.model(data), label)

        weight = self.model.weight.data.item()

        return {"val_loss": loss, "weight": weight}

    def build_training_data_loader(self) -> pytorch.DataLoader:
        return pytorch.DataLoader(
            IdentityDataset(), batch_size=self.context.get_per_slot_batch_size()
        )

    def build_validation_data_loader(self) -> pytorch.DataLoader:
        return pytorch.DataLoader(
            IdentityDataset(20), batch_size=self.context.get_per_slot_batch_size()
        )

    def build_callbacks(self) -> Dict[str, pytorch.PyTorchCallback]:
        return {"metrics": self.metrics_callback}
```
> **Comment:** does this need to be gpu even?

> **Comment:** same for `test_gc_checkpoints` here

> **Comment:** same for `test_s3_no_creds` (though it's being skipped, it seems)

> **Comment:** that might be a question for @stoksc, I think maybe that's to run those tests on fewer clusters?

> **Comment:** wouldn't `e2e_cpu` cause it to run on fewer clusters?