From b6f1dd73487083524bc24acd369f13b61aabeced Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:17:52 -0500 Subject: [PATCH 001/273] print when saving checkpoint --- ocpmodels/common/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 2951078571..1938cf98b8 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -343,6 +343,7 @@ def save_checkpoint( state, checkpoint_dir="checkpoints/", checkpoint_file="checkpoint.pt" ): filename = os.path.join(checkpoint_dir, checkpoint_file) + print(f"Saving checkpoint to {filename}") torch.save(state, filename) From 5421e5972fda81423e43b317db96ef401131b5aa Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:18:35 -0500 Subject: [PATCH 002/273] log every n steps --- ocpmodels/common/flags.py | 6 ++++++ ocpmodels/trainers/single_trainer.py | 10 ++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 018b9dbe48..92dbb14421 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -241,6 +241,12 @@ def add_core_args(self): default=False, help="Don't copy LMDB data to $SLURM_TMPDIR and work from there", ) + self.parser.add_argument( + "--log_train_every", + type=int, + default=100, + help="Log training loss every n steps", + ) flags = Flags() diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index b6d1ae3c29..dc87854fee 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -247,10 +247,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): metrics={}, ) scale = self.scaler.get_scale() if self.scaler else 1.0 - for k, v in loss.items(): - self.metrics = self.evaluator.update( - k, v.item() / scale, self.metrics - ) + + if i_for_epoch % log_train_every == 0: + for k, v in loss.items(): + self.metrics = self.evaluator.update( + k, v.item() / scale, self.metrics + ) # Log metrics. 
self.log_train_metrics() From 46c8cdd7cd6ff2c53ecfc2dd53aa91dfc5ce21bd Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:18:45 -0500 Subject: [PATCH 003/273] add timing class --- ocpmodels/common/timer.py | 87 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 ocpmodels/common/timer.py diff --git a/ocpmodels/common/timer.py b/ocpmodels/common/timer.py new file mode 100644 index 0000000000..cb38f1c731 --- /dev/null +++ b/ocpmodels/common/timer.py @@ -0,0 +1,87 @@ +import torch +from time import time, sleep +from collections import defaultdict +import numpy as np + + +class Timer: + def __init__(self, name, store={}, gpu=False, ignore=False): + self.times = store + self.name = name + self.gpu = gpu + self.ignore = ignore + + def __enter__(self): + if self.ignore: + return self + if self.gpu: + self.start = torch.cuda.Event(enable_timing=True) + self.end = torch.cuda.Event(enable_timing=True) + self.start.record() + else: + self.start = time() + return self + + def __exit__(self, *args): + if self.ignore: + return + if self.gpu: + self.end.record() + torch.cuda.synchronize() + self.duration = self.start.elapsed_time(self.end) / 1000 + else: + self.end = time() + self.duration = self.end - self.start + self.times[self.name].append(self.duration) + + +class Times: + def __init__(self, gpu=False, ignore=False): + self.times = defaultdict(list) + self.timers = {} + self.gpu = gpu + self.ignore = ignore + + def reset(self): + self.times = defaultdict(list) + self.timers = {} + + def prepare_for_logging(self): + """ + Computes mean and standard deviation of all timers. + Returns a tuple: (mean_times_dict, std_times_dict) + + Returns: + tuple[dict]: a dict with mean times and a dict with std times + """ + mean_times = {} + std_times = {} + for k, v in self.times.items(): + mean_times[k] = np.mean(v) + std_times[k] = np.std(v) + return mean_times, std_times + + def next(self, name, ignore=None): + if "name" not in self.timers: + if ignore is None: + ignore = self.ignore + self.timers[name] = Timer(name, self.times, self.gpu, ignore) + return self.timers[name] + + +if __name__ == "__main__": + + times = Times(gpu=True) + with times.next("a"): + sleep(0.1) + with times.next("b"): + sleep(0.2) + with times.next("a"): + sleep(0.3) + with times.next("b"): + sleep(0.4) + with times.next("a"): + sleep(0.5) + with times.next("b"): + sleep(0.6) + print(times.prepare_for_logging()) From 1513bacc4296a6282bb02575fd42bfef061b193a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:18:52 -0500 Subject: [PATCH 004/273] time validation --- ocpmodels/trainers/base_trainer.py | 48 +++++++++++++++++------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e983155c42..28b24a80df 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -44,6 +44,7 @@ from ocpmodels.modules.loss import DDPLoss, L2MAELoss from ocpmodels.modules.normalizer import Normalizer from ocpmodels.modules.scheduler import LRScheduler +from ocpmodels.common.timer import Times @registry.register_trainer("base") @@ -534,6 +535,7 @@ def validate( disable_tqdm=True, debug_batches=-1, is_final=False, + is_first=False, ): if distutils.is_master() and not self.silent: print() @@ -554,30 +556,34 @@ def validate( desc = "device {}".format(distutils.get_rank()) loader = self.loaders[split] - val_time = time.time() + times = Times(gpu=True) - for i, 
batch in enumerate(tqdm(loader, desc=desc, disable=disable_tqdm)): + with times.next("validation_loop"): - if self.sigterm: - return "SIGTERM" + for i, batch in enumerate(tqdm(loader, desc=desc, disable=disable_tqdm)): + + if self.sigterm: + return "SIGTERM" + + if debug_batches > 0 and i == debug_batches: + break - if debug_batches > 0 and i == debug_batches: - break + # Forward. + with torch.cuda.amp.autocast(enabled=self.scaler is not None): + with times.next("model_forward", ignore=not is_first): + preds = self.model_forward(batch) + loss = self.compute_loss(preds, batch) - # Forward. - with torch.cuda.amp.autocast(enabled=self.scaler is not None): - preds = self.model_forward(batch) + if preds.get("pooling_loss") is not None: + loss["total_loss"] += preds["pooling_loss"] - loss = self.compute_loss(preds, batch) - if preds.get("pooling_loss") is not None: - loss["total_loss"] += preds["pooling_loss"] + # Compute metrics. + metrics = self.compute_metrics(preds, batch, evaluator, metrics) + for k, v in loss.items(): + metrics = evaluator.update(k, v.item(), metrics) - # Compute metrics. - metrics = self.compute_metrics(preds, batch, evaluator, metrics) - for k, v in loss.items(): - metrics = evaluator.update(k, v.item(), metrics) + mean_val_times, std_val_times = times.prepare_for_logging() - val_time = time.time() - val_time aggregated_metrics = {} for k in metrics: aggregated_metrics[k] = { @@ -594,9 +600,11 @@ def validate( metrics = aggregated_metrics log_dict = {k: metrics[k]["metric"] for k in metrics} - log_dict.update({"epoch": self.epoch}) - log_dict.update({f"{split}_time": val_time}) - log_dict.update({f"{split}_n_samples": i + 1}) + log_dict["epoch"] = self.epoch + log_dict[f"{split}_time"] = mean_val_times["validation_loop"] + if is_first: + log_dict["model_forward_time_mean"] = mean_val_times["model_forward"] + log_dict["model_forward_time_std"] = std_val_times["model_forward"] if distutils.is_master() and not self.silent: log_str = ["{}: {:.4f}".format(k, v) for k, v in log_dict.items()] From 0dcea16185f70eb2fb4e5b9ff32810412f313709 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:18:57 -0500 Subject: [PATCH 005/273] time batch retrieval --- ocpmodels/trainers/single_trainer.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index dc87854fee..6f8c3a2ceb 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -25,6 +25,7 @@ from ocpmodels.modules.evaluator import Evaluator from ocpmodels.modules.normalizer import Normalizer from ocpmodels.trainers.base_trainer import BaseTrainer +from ocpmodels.common.timer import Times is_test_env = os.environ.get("ocp_test_env", False) @@ -190,10 +191,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): ) self.best_val_metric = np.inf current_val_metric = None + first_eval = True # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. 
start_epoch = self.step // n_train + loader_times = Times() epoch_times = [] if not self.silent: @@ -211,6 +214,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): train_loader_iter = iter(self.loaders["train"]) self.model.train() i_for_epoch = 0 + log_train_every = self.config["log_train_every"] for i in range(skip_steps, n_train): if self.sigterm: @@ -220,7 +224,8 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.step = epoch_int * n_train + i + 1 # Get a batch. - batch = next(train_loader_iter) + with loader_times.time("get_batch"): + batch = next(train_loader_iter) # Forward, loss, backward. with torch.cuda.amp.autocast(enabled=self.scaler is not None): @@ -229,10 +234,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): if preds.get("pooling_loss") is not None: coeff = self.config["optim"].get("pooling_coefficient", 1) loss["total_loss"] += preds["pooling_loss"] * coeff + loss = { k: self.scaler.scale(v) if self.scaler else v for k, v in loss.items() } + if torch.isnan(loss["total_loss"]): print("\n\n >>> 🛑 Loss is NaN. Stopping training.\n\n") self.logger.add_tags(["nan_loss"]) @@ -254,8 +261,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): k, v.item() / scale, self.metrics ) - # Log metrics. - self.log_train_metrics() + # Log metrics. + gbm, gbs = loader_times.prepare_for_logging() + self.metrics["get_batch_time_mean"] = {"metric": gbm["get_batch"]} + self.metrics["get_batch_time_std"] = {"metric": gbs["get_batch"]} + loader_times.reset() + self.log_train_metrics() is_final_epoch = epoch_int == self.config["optim"]["max_epochs"] - 1 is_final_batch = (i == n_train - 1) or ( @@ -283,7 +294,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): split=self.config["dataset"]["default_val"], disable_tqdm=disable_eval_tqdm, debug_batches=debug_batches, + is_first=first_eval, ) + first_eval = False if val_metrics == "SIGTERM": return "SIGTERM" current_val_metric = val_metrics[primary_metric]["metric"] From 3c41f67bbd5f74af9024a70866740874ac1c5aeb Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:21:14 -0500 Subject: [PATCH 006/273] typo --- ocpmodels/trainers/single_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 6f8c3a2ceb..6d6c24a916 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -224,7 +224,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.step = epoch_int * n_train + i + 1 # Get a batch. - with loader_times.time("get_batch"): + with loader_times.next("get_batch"): batch = next(train_loader_iter) # Forward, loss, backward. 
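For reference, a minimal sketch of how the `Times` helper introduced in patch 003 is meant to be used once the `.time` → `.next` typo above is fixed. The import path comes from this series; the sleep call merely stands in for real work such as `batch = next(train_loader_iter)`, and torch/numpy are assumed to be installed since `ocpmodels.common.timer` imports them.

```python
from time import sleep

from ocpmodels.common.timer import Times  # module added in patch 003 of this series

loader_times = Times()  # gpu=True would time with CUDA events instead of time()

for _ in range(3):
    # next(name) returns a Timer context manager; on exit it appends the
    # measured duration to the list stored under `name`.
    with loader_times.next("get_batch"):
        sleep(0.01)  # stand-in for `batch = next(train_loader_iter)`

# prepare_for_logging() returns ({name: mean}, {name: std}); the trainer logs
# these as get_batch_time_mean / get_batch_time_std and then calls reset().
mean_times, std_times = loader_times.prepare_for_logging()
print(mean_times, std_times)
loader_times.reset()
```
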
From 64180f2f85241a856b9ba6a2a5d45b66f9002084 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:32:08 -0500 Subject: [PATCH 007/273] remove `energy_within_threshold` from qm9 and qm7x --- ocpmodels/modules/evaluator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ocpmodels/modules/evaluator.py b/ocpmodels/modules/evaluator.py index 38f1833295..da01447b3d 100644 --- a/ocpmodels/modules/evaluator.py +++ b/ocpmodels/modules/evaluator.py @@ -56,12 +56,10 @@ class Evaluator: "qm9": [ "energy_mae", "energy_mse", - "energy_within_threshold", ], "qm7x": [ "energy_mae", "energy_mse", - "energy_within_threshold", ], } From c1b844622aeb20037d3d6207ff8a436049750c5d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:32:28 -0500 Subject: [PATCH 008/273] initialize `signal` to `None` --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e561a088e6..1d6a2b842d 100644 --- a/main.py +++ b/main.py @@ -134,7 +134,7 @@ def print_warnings(): if __name__ == "__main__": - ntfy = trainer = error = None + ntfy = trainer = error = signal = None setup_logging() From f5ca589ec067a3aa6520c7b85aa2a2ee797d7211 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:32:34 -0500 Subject: [PATCH 009/273] log epoch time --- ocpmodels/trainers/single_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 6d6c24a916..23acb1f513 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -317,9 +317,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # End of batch. # End of epoch. + epoch_times.append(time.time() - start_time) + self.metrics["epoch_time"] = {"metric": epoch_times[-1]} self.log_train_metrics(end_of_epoch=True) torch.cuda.empty_cache() - epoch_times.append(time.time() - start_time) # End of training. 
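A schematic illustration of the epoch-timing bookkeeping in patches 005 and 009: the duration is appended before the end-of-epoch logging call so that `epoch_times[-1]` exists when the metric dict is built, and the mean is logged once at the end of training. Names mirror `single_trainer.py`; the sleep is a stand-in for the inner batch loop.

```python
import time

epoch_times = []  # collected across the whole run
metrics = {}      # stand-in for self.metrics

for epoch_int in range(2):
    start_time = time.time()
    time.sleep(0.01)  # stand-in for the loop over training batches
    epoch_times.append(time.time() - start_time)
    metrics["epoch_time"] = {"metric": epoch_times[-1]}
    # self.log_train_metrics(end_of_epoch=True) would flush `metrics` here

# End of training: the average epoch duration is logged as "Epoch time".
print({"Epoch time": sum(epoch_times) / len(epoch_times)})
```
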
From 55471ab24cccc2ad2adcf4b47b4fc8ada6509947 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:59:00 -0500 Subject: [PATCH 010/273] fix log epoch_time --- ocpmodels/common/logger.py | 4 +++- ocpmodels/trainers/single_trainer.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index f526f686b7..1e329d59f0 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -91,7 +91,9 @@ class WandBLogger(Logger): def __init__(self, trainer_config): super().__init__(trainer_config) - wandb_id = "" + wandb_id = str(self.trainer_config.get("wandb_id", "")) + if wandb_id: + wandb_id += " - " slurm_jobid = os.environ.get("SLURM_JOB_ID") if slurm_jobid: wandb_id += f"{slurm_jobid}-" diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 23acb1f513..7c0137a6e3 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -583,14 +583,14 @@ def log_train_metrics(self, end_of_epoch=False): and distutils.is_master() and not self.is_hpo ) or (distutils.is_master() and end_of_epoch): - log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] if not self.silent: + log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] print( f"Train metrics at step {self.step}:\n > " + "\n > ".join(log_str) ) self.metrics = {} - if self.logger is not None and not end_of_epoch: + if self.logger is not None: # and not end_of_epoch: self.logger.log( log_dict, step=self.step, From 27f540e23e0d6a9e3b759a48c5fba3bf9b6d3e70 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 19:23:42 -0500 Subject: [PATCH 011/273] print log_train_every --- ocpmodels/trainers/single_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 7c0137a6e3..38afa7d9ce 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -192,6 +192,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.best_val_metric = np.inf current_val_metric = None first_eval = True + log_train_every = self.config["log_train_every"] + + print("Logging train metrics every {} steps".format(log_train_every)) # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. 
@@ -214,7 +217,6 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): train_loader_iter = iter(self.loaders["train"]) self.model.train() i_for_epoch = 0 - log_train_every = self.config["log_train_every"] for i in range(skip_steps, n_train): if self.sigterm: From ca4a173c63643b79f324f3545d52c2094d7f934b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 20:01:31 -0500 Subject: [PATCH 012/273] fix print-every --- ocpmodels/common/flags.py | 11 ++--------- ocpmodels/trainers/single_trainer.py | 3 ++- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 92dbb14421..f487115b31 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -64,9 +64,9 @@ def add_core_args(self): ) self.parser.add_argument( "--print-every", - default=1000, + default=-1, type=int, - help="Log every N iterations (default: 10)", + help="Log every N iterations (default: -1 = end of epoch)", ) self.parser.add_argument( "--seed", default=0, type=int, help="Seed for torch, cuda, numpy" @@ -172,13 +172,6 @@ def add_core_args(self): default="", help="Comma-separated tags for wandb", ) - self.parser.add_argument( - "--print_every", - type=int, - default=-1, - help="Printing frequency (in steps). " - + "Default (-1) prints at the end of the epoch.", - ) self.parser.add_argument( "--wandb_project", type=str, diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 38afa7d9ce..9d3ac2d811 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -194,7 +194,8 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): first_eval = True log_train_every = self.config["log_train_every"] - print("Logging train metrics every {} steps".format(log_train_every)) + print(f"Logging train metrics every {log_train_every} steps") + print(f"Printing train metrics every {self.config['print_every']} steps") # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. 
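Patches 011-012 separate two cadences: `--log_train_every` controls how often per-step losses are folded into `self.metrics`, while `--print-every` (now defaulting to -1, i.e. end of epoch only) controls how often they are printed. A schematic sketch of that split, with illustrative gating rather than the exact condition used in `log_train_metrics`:

```python
log_train_every = 100  # --log_train_every: accumulate train metrics every N steps
print_every = -1       # --print-every: print every N steps; -1 = end of epoch only

n_train = 1000
for i_for_epoch in range(n_train):
    if i_for_epoch % log_train_every == 0:
        pass  # fold the scaled per-loss values into self.metrics here

    end_of_epoch = i_for_epoch == n_train - 1
    if (print_every > 0 and (i_for_epoch + 1) % print_every == 0) or end_of_epoch:
        pass  # print the accumulated train metrics, then clear self.metrics
```
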
From 36d1eda51f24b64ff83c6cdf8b153f29bed35e91 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 21:09:42 -0500 Subject: [PATCH 013/273] comment out step print --- ocpmodels/trainers/single_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 9d3ac2d811..388afa127c 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -269,6 +269,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.metrics["get_batch_time_mean"] = {"metric": gbm["get_batch"]} self.metrics["get_batch_time_std"] = {"metric": gbs["get_batch"]} loader_times.reset() + # logging.info(f"Step: {self.step}") self.log_train_metrics() is_final_epoch = epoch_int == self.config["optim"]["max_epochs"] - 1 From fbeb9f46764398ae11809617bbdc6f4c67a222b3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 00:02:26 -0500 Subject: [PATCH 014/273] write summary yaml --- launch_exp.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/launch_exp.py b/launch_exp.py index f1e043c7e1..16f5932b2b 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -58,6 +58,28 @@ def merge_dicts(dict1: dict, dict2: dict): return return_dict +def write_exp_yaml_and_jobs(exp_file, outfile, jobs): + """ + Reads the exp_file, adds the jobs as comments in each run line and writes the + resulting yaml file in the same directory as the outfile. + + Args: + exp_file (Path): Path to the experimental yaml file + outfile (Path): Path to the output txt file + jobs (list[str]): List of jobs, one per run line in the yaml exp_file + """ + lines = exp_file.read_text().splitlines() + run_line = lines.index("runs:") + j = 0 + for i, line in enumerate(lines[run_line:]): + if line.strip().startswith("- "): + lines[run_line + i] = f"{line} # {jobs[j]}" + j += 1 + yml_out = outfile.with_suffix(".yaml") + yml_out.write_text("\n".join(lines)) + return yml_out + + def get_commit(): try: commit = ( @@ -173,5 +195,9 @@ def cli_arg(args, key=""): f.write(text) print(f"Output written to {str(outfile)}") print("All job launched:", " ".join(jobs)) + yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) + print( + "Experiment summary YAML in ", f"./{str(yml_out.relative_to(Path.cwd()))}" + ) else: print("Aborting") From 22ef8240584e50ff261b8745c35081010dc6643a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 00:02:53 -0500 Subject: [PATCH 015/273] update exp based on results from `2624343` and `2623710` --- configs/exps/qm7x/schnet.yaml | 83 ++++++++++++----------------------- 1 file changed, 28 insertions(+), 55 deletions(-) diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index da55e6304d..1715b989cf 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -8,6 +8,7 @@ job: env: ocp-a100 default: + config: schnet-qm7x-all wandb_project: ocp-qm mode: train test_ri: true @@ -31,7 +32,7 @@ default: val_ood: std_divider: 10.0 optim: - batch_size: 32 + batch_size: 1024 warmup_steps: 3000 lr_initial: 0.0005 # parameters EMA @@ -39,62 +40,34 @@ default: decay_steps: 750000 decay_rate: 0.05 max_steps: 1000000 + model: + hidden_channels: 256 + num_filters: 256 + num_gaussians: 100 + num_interactions: 6 + cutoff: 5.0 runs: - - config: schnet-qm7x-all - model: - hidden_channels: 128 - num_gaussians: 20 - num_filters: 128 - num_interactions: 6 - cutoff: 5.0 - - - - config: schnet-qm7x-all - model: - hidden_channels: 256 - 
num_gaussians: 20 - num_filters: 256 - num_interactions: 6 - cutoff: 5.0 - - - config: schnet-qm7x-all - model: - hidden_channels: 256 - num_gaussians: 20 - num_filters: 256 - num_interactions: 6 - cutoff: 5.0 - optim: + - {} + - optim: + batch_size: 2048 + - optim: + batch_size: 4096 + - optim: lr_initial: 0.001 - - - config: schnet-qm7x-all - model: - hidden_channels: 512 - num_gaussians: 20 - num_filters: 256 - num_interactions: 6 - cutoff: 5.0 - optim: - lr_initial: 0.005 - - - config: schnet-qm7x-all - model: - hidden_channels: 128 - num_gaussians: 20 - num_filters: 128 + - optim: + lr_initial: 0.001 + batch_size: 2048 + - model: + num_gaussians: 200 + - model: + hidden_channels: 1024 + - model: + num_filters: 1024 + - model: + num_interactions: 8 + - model: num_interactions: 6 - cutoff: 5.0 - optim: - lr_initial: 0.0002 - - - config: schnet-qm7x-all - model: - hidden_channels: 512 num_gaussians: 20 - num_filters: 512 - num_interactions: 6 - cutoff: 5.0 - optim: - batch_size: 128 - lr_initial: 0.0001 \ No newline at end of file + num_filters: 64 + hidden_channels: 1024 \ No newline at end of file From 45904b20470befafcd7dbd0d238e365f3eaf3210 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 00:18:35 -0500 Subject: [PATCH 016/273] v0 fanets QMs --- configs/exps/qm7x/fanet.yaml | 106 +++++++++++++++++++++++++++++++++++ configs/exps/qm9/fanet.yaml | 99 ++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 configs/exps/qm7x/fanet.yaml create mode 100644 configs/exps/qm9/fanet.yaml diff --git a/configs/exps/qm7x/fanet.yaml b/configs/exps/qm7x/fanet.yaml new file mode 100644 index 0000000000..0dd40f39c1 --- /dev/null +++ b/configs/exps/qm7x/fanet.yaml @@ -0,0 +1,106 @@ +# trainset has 4068193 samples +job: + mem: 48GB + cpus: 8 + gres: gpu:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: fanet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x, std/10 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, mp_type, edge_embed_type + optim: batch_size, lr_initial + dataset: + train: + std_divider: 10.0 + val_id: + std_divider: 10.0 + val_ood: + std_divider: 10.0 + optim: + batch_size: 1024 + warmup_steps: 3000 + lr_initial: 0.0005 + # parameters EMA + ema_decay: 0.999 + decay_steps: 750000 + decay_rate: 0.05 + max_steps: 1000000 + model: + # PhAST + phys_embeds: False + phys_hidden_channels: 0 + energy_head: False # "weighted-av-init-embeds", "weighted-av-final-embeds" + pg_hidden_channels: 0 + tag_hidden_channels: 0 + # archi + hidden_channels: 256 + num_filters: 256 + num_gaussians: 32 + num_interactions: 4 + cutoff: 6.0 + regress_forces: False + # fanet + skip_co: False # output skip connections + second_layer_MLP: False # in EmbeddingBlock + edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) + mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} + force_decoder_type: "simple" # can be {"" or "simple"} | only used if regress_forces is True + force_decoder_model_config: + simple: + hidden_channels: 128 + +runs: + - {} + - model: + pg_hidden_channels: 64 + - model: + energy_head: "weighted-av-init-embeds" + - model: + phys_embeds: True + - model: + pg_hidden_channels: 64 + phys_embeds: True + energy_head: "weighted-av-init-embeds" + - model: + mp_type: base + edge_embed_type: all_rij + - model: + mp_type: base + 
edge_embed_type: sh + - model: + mp_type: base + edge_embed_type: all + - model: + mp_type: simple + edge_embed_type: rij + - model: + mp_type: simple + edge_embed_type: all_rij + - model: + mp_type: simple + edge_embed_type: sh + - model: + mp_type: simple + edge_embed_type: all + - model: + mp_type: updownscale + edge_embed_type: rij + - model: + mp_type: updownscale + edge_embed_type: all_rij + - model: + mp_type: updownscale + edge_embed_type: sh + - model: + mp_type: updownscale + edge_embed_type: all diff --git a/configs/exps/qm9/fanet.yaml b/configs/exps/qm9/fanet.yaml new file mode 100644 index 0000000000..b258c5c3df --- /dev/null +++ b/configs/exps/qm9/fanet.yaml @@ -0,0 +1,99 @@ +# trainset has 4068193 samples +job: + mem: 48GB + cpus: 8 + gres: gpu:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: fanet-qm9-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm9, std/10 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions + optim: batch_size, lr_initial + optim: + batch_size: 1024 + warmup_steps: 3000 + lr_initial: 0.0005 + # parameters EMA + ema_decay: 0.999 + decay_steps: 750000 + decay_rate: 0.05 + max_steps: 1000000 + model: + # PhAST + phys_embeds: False + phys_hidden_channels: 0 + energy_head: False # "weighted-av-init-embeds", "weighted-av-final-embeds" + pg_hidden_channels: 0 + tag_hidden_channels: 0 + # archi + hidden_channels: 256 + num_filters: 256 + num_gaussians: 32 + num_interactions: 4 + cutoff: 6.0 + regress_forces: False + # fanet + skip_co: False # output skip connections + second_layer_MLP: False # in EmbeddingBlock + edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) + mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} + force_decoder_type: "simple" # can be {"" or "simple"} | only used if regress_forces is True + force_decoder_model_config: + simple: + hidden_channels: 128 + +runs: + - {} + - model: + pg_hidden_channels: 64 + - model: + energy_head: "weighted-av-init-embeds" + - model: + phys_embeds: True + - model: + pg_hidden_channels: 64 + phys_embeds: True + energy_head: "weighted-av-init-embeds" + - model: + mp_type: base + edge_embed_type: all_rij + - model: + mp_type: base + edge_embed_type: sh + - model: + mp_type: base + edge_embed_type: all + - model: + mp_type: simple + edge_embed_type: rij + - model: + mp_type: simple + edge_embed_type: all_rij + - model: + mp_type: simple + edge_embed_type: sh + - model: + mp_type: simple + edge_embed_type: all + - model: + mp_type: updownscale + edge_embed_type: rij + - model: + mp_type: updownscale + edge_embed_type: all_rij + - model: + mp_type: updownscale + edge_embed_type: sh + - model: + mp_type: updownscale + edge_embed_type: all From b09db3881a77026d02c55be6d7e56c8afa98a8bb Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 11:22:49 -0500 Subject: [PATCH 017/273] add min 16GB memory in GPU gres --- configs/exps/qm7x/fanet.yaml | 2 +- configs/exps/qm7x/schnet.yaml | 2 +- configs/exps/qm9/baselines.yaml | 2 +- configs/exps/qm9/fanet.yaml | 2 +- configs/exps/qm9/sfarinet.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/exps/qm7x/fanet.yaml b/configs/exps/qm7x/fanet.yaml index 0dd40f39c1..627caca163 100644 --- a/configs/exps/qm7x/fanet.yaml +++ b/configs/exps/qm7x/fanet.yaml @@ -2,7 +2,7 @@ job: mem: 48GB cpus: 8 - gres: gpu:1 + gres: 
gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 env: ocp-a100 diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index 1715b989cf..65d4d6654f 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -2,7 +2,7 @@ job: mem: 48GB cpus: 8 - gres: gpu:1 + gres: gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 env: ocp-a100 diff --git a/configs/exps/qm9/baselines.yaml b/configs/exps/qm9/baselines.yaml index 2cafb1c69e..f43d553446 100644 --- a/configs/exps/qm9/baselines.yaml +++ b/configs/exps/qm9/baselines.yaml @@ -2,7 +2,7 @@ job: mem: 48GB cpus: 4 - gres: gpu:1 + gres: gpu:16gb:1 time: 24:00:00 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 diff --git a/configs/exps/qm9/fanet.yaml b/configs/exps/qm9/fanet.yaml index b258c5c3df..fedb248329 100644 --- a/configs/exps/qm9/fanet.yaml +++ b/configs/exps/qm9/fanet.yaml @@ -2,7 +2,7 @@ job: mem: 48GB cpus: 8 - gres: gpu:1 + gres: gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 env: ocp-a100 diff --git a/configs/exps/qm9/sfarinet.yaml b/configs/exps/qm9/sfarinet.yaml index 701a760143..2f3a3104cb 100644 --- a/configs/exps/qm9/sfarinet.yaml +++ b/configs/exps/qm9/sfarinet.yaml @@ -1,7 +1,7 @@ job: mem: 48GB cpus: 4 - gres: gpu:1 + gres: gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 env: ocp-a100 From 6924a32e7f99dd0e7c6612894e00109d289679b8 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 12:22:46 -0500 Subject: [PATCH 018/273] more info when overriding max_epochs --- ocpmodels/trainers/base_trainer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 28b24a80df..c188f0d32b 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -215,6 +215,7 @@ def load_datasets(self): transform = get_transforms(self.config) # TODO: train/val/test behavior batch_size = self.config["optim"]["batch_size"] + max_steps = self.config["optim"].get("max_steps", -1) for split, ds_conf in self.config["dataset"].items(): if split == "default_val": @@ -227,23 +228,21 @@ def load_datasets(self): shuffle = False if split == "train": shuffle = True - if self.config["optim"].get("max_steps"): + if max_steps > 0: if self.config["optim"].get("max_epochs", -1) > 0: print( "WARNING: Both max_steps and max_epochs are set.", "Using max_steps.", ) self.config["optim"]["max_epochs"] = int( - np.ceil( - self.config["optim"]["max_steps"] - / np.ceil(len(self.datasets[split]) / batch_size) - ) + np.ceil(max_steps / (len(self.datasets[split]) / batch_size)) ) print( "Setting max_epochs to", self.config["optim"]["max_epochs"], - f"from max_steps ({self.config['optim']['max_steps']})", - f"and batch_size ({self.config['optim']['batch_size']})\n", + f"from max_steps ({max_steps}),", + f"dataset length ({len(self.datasets[split])}),", + f"and batch_size ({batch_size})\n", ) self.samplers[split] = self.get_sampler( From 8314c023c4a70e44f36ea22097ec280e8d2418e0 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 6 Jan 2023 06:18:13 -0500 Subject: [PATCH 019/273] update configs --- configs/exps/gnn/test-gnn-1.yaml | 44 ++++++++++ configs/exps/gnn/test-gnn-all-1.yaml | 31 +++++++ configs/exps/prop-check/symmetries.yaml | 59 +++++++++++-- configs/exps/prop-check/symmetries_is2re.yaml | 37 ++++---- 
configs/models/fanet.yaml | 84 ++++++++++++++----- ocpmodels/models/fanet.py | 2 +- 6 files changed, 206 insertions(+), 51 deletions(-) create mode 100644 configs/exps/gnn/test-gnn-1.yaml create mode 100644 configs/exps/gnn/test-gnn-all-1.yaml diff --git a/configs/exps/gnn/test-gnn-1.yaml b/configs/exps/gnn/test-gnn-1.yaml new file mode 100644 index 0000000000..223f94d929 --- /dev/null +++ b/configs/exps/gnn/test-gnn-1.yaml @@ -0,0 +1,44 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? + optim: + lr_initial: 0.002 + lr_gamma: 0.07 + warmup_steps: 500 + wandb_tags: 'test-fanet' + +runs: + - config: fanet-is2re-10k + note: 'Base rij FANet GNNs' + model: + mp_type: base + edge_embed_type: rij + - config: fanet-is2re-10k + note: 'Updownscale all-embeds FANet GNNs' + model: + mp_type: updownscale + edge_embed_type: all + - config: fanet-is2re-10k + note: 'Simple SH FANet GNNs' + model: + mp_type: simple + edge_embed_type: sh + - config: fanet-is2re-10k + note: 'Simple skip-co 2-layers FANet GNNs' + model: + skip_co: True + second_layer_MLP: True + edge_embed_type: all_rij diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml new file mode 100644 index 0000000000..65fd1a50c8 --- /dev/null +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -0,0 +1,31 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:4 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? + wandb_tags: 'test-fanet' + +runs: + - config: fanet-is2re-all + note: 'Simple rij baseline' + model: + mp_type: simple + edge_embed_type: rij + optim: + lr_initial: 0.0007 + - config: fanet-is2re-all + note: 'Simple rij baseline' + model: + mp_type: simple + edge_embed_type: rij diff --git a/configs/exps/prop-check/symmetries.yaml b/configs/exps/prop-check/symmetries.yaml index cfdecbb322..0b26ce2d81 100644 --- a/configs/exps/prop-check/symmetries.yaml +++ b/configs/exps/prop-check/symmetries.yaml @@ -3,7 +3,7 @@ job: cpus: 4 gres: gpu:rtx8000:4 partition: long - time: 48:00:00 + time: 40:00:00 default: test_ri: True @@ -14,33 +14,76 @@ default: tag_hidden_channels: 64 pg_hidden_channels: 0 # shall have been 32 energy_head: False # False ? 
- regress_forces: from_energy optim: max_epochs: 5 wandb_tags: 'prop-check-ICLM' runs: + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs' + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: all + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 3D fa_frames: all + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' - frame_averaging: 3D - fa_frames: random + frame_averaging: DA + model: + regress_forces: from_energy + - config: sfarinet-s2ef-2M + note: 'Test Force Equivariance' + frame_averaging: 2D + fa_frames: det + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D - fa_frames: random + fa_frames: se3-det + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M - note: 'Test Forces SE(3)-Equivariance' + note: 'Test Force Equivariance' frame_averaging: 2D - fa_frames: se3-all + fa_frames: all + model: + regress_forces: direct - config: sfarinet-s2ef-2M - note: 'Test Forces SE(3)-Equivariance' + note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: se3-random + model: + regress_forces: direct + - config: sfarinet-s2ef-2M + note: 'Test Force Equivariance' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + - config: sfarinet-s2ef-2M + note: 'Test Force Equivariance' + frame_averaging: DA + model: + regress_forces: direct_with_gradient_target + - config: sfarinet-s2ef-2M + note: 'No forces coefficient ! Only energy' + model: + regress_forces: direct + force_coefficient: 0 + energy_grad_coefficient: 10 + - config: sfarinet-s2ef-2M + note: 'Large energy grad coef' + frame_averaging: DA + model: + regress_forces: direct_with_gradient_target + energy_grad_coefficient: 50 diff --git a/configs/exps/prop-check/symmetries_is2re.yaml b/configs/exps/prop-check/symmetries_is2re.yaml index 1d07dab61f..d4553cf6f7 100644 --- a/configs/exps/prop-check/symmetries_is2re.yaml +++ b/configs/exps/prop-check/symmetries_is2re.yaml @@ -1,5 +1,5 @@ job: - mem: 32GB + mem: 48GB cpus: 4 gres: gpu:rtx8000:4 partition: long @@ -17,36 +17,35 @@ default: wandb_tags: 'prop-check-ICLM' runs: - - config: sfarinet-is2re-all - note: 'Baseline' - - config: sfarinet-is2re-all - note: 'Test 2D all symmetries IS2RE + FA correctness' - frame_averaging: 2D - fa_frames: all - config: sfarinet-is2re-all note: 'Test 3D all symmetries IS2RE + FA correctness' - frame_averaging: 3D + frame_averaging: DA fa_frames: all - config: sfarinet-is2re-all note: 'Test 3D se3-all symmetries IS2RE + FA correctness' frame_averaging: 3D fa_frames: se3-all - config: sfarinet-is2re-all - note: 'Test 2D random symmetries IS2RE + FA correctness' + note: '2D det symmetries IS2RE + FA correctness' frame_averaging: 2D - fa_frames: random + fa_frames: det - config: sfarinet-is2re-all - note: 'Test 2D se3-random symmetries IS2RE + FA correctness' - frame_averaging: 2D - fa_frames: se3-random + note: '3D det symmetries IS2RE + FA correctness' + frame_averaging: 3D + fa_frames: det - config: sfarinet-is2re-all - note: 'Test 2D random symmetries IS2RE + FA correctness more epochs' + note: '2D se3-random 30 epochs symmetries IS2RE + FA correctness' frame_averaging: 2D - fa_frames: random - optim: + fa_frames: se3-random + optim: max_epochs: 30 - config: sfarinet-is2re-all - note: 'Test invariance of DA more epochs' - optim: + note: 'Baseline 30 epochs 
symmetries IS2RE + FA correctness' + optim: max_epochs: 30 - frame_averaging: 'DA' + - config: sfarinet-is2re-all + note: '2D all 30 epochs symmetries IS2RE + FA correctness' + frame_averaging: 2D + fa_frames: all + optim: + max_epochs: 30 \ No newline at end of file diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index c03ba706cc..9840d6432f 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -2,8 +2,8 @@ default: model: name: fanet act: swish - hidden_channels: 256 - num_filters: 128 + hidden_channels: 128 + num_filters: 100 num_interactions: 3 num_gaussians: 100 cutoff: 6.0 @@ -20,11 +20,12 @@ default: second_layer_MLP: False # in EmbeddingBlock edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} - force_decoder_type: "simple" # can be {"" or "simple"} | only used if regress_forces is True + force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True force_decoder_model_config: simple: hidden_channels: 128 - + mlp: + hidden_channels: 256 optim: batch_size: 64 eval_batch_size: 64 @@ -33,6 +34,9 @@ default: lr_initial: 0.001 warmup_factor: 0.2 max_epochs: 20 + energy_grad_coefficient: 10 + force_coefficient: 30 + energy_coefficient: 1 frame_averaging: False # 2D, 3D, da, False fa_frames: False # can be {None, full, random, det, e3, e3-random, e3-det} @@ -53,6 +57,8 @@ is2re: max_epochs: 20 100k: + model: + hidden_channels: 256 optim: lr_initial: 0.005 lr_milestones: # epochs at which lr_initial <- lr_initial * lr_gamma @@ -63,14 +69,17 @@ is2re: max_epochs: 20 all: + model: + hidden_channels: 384 + num_interactions: 4 optim: lr_initial: 0.001 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - - 16000 - - 25000 - - 35000 + - 18000 + - 27000 + - 37000 warmup_steps: 5394 - max_epochs: 17 + max_epochs: 20 # ------------------ # ----- S2EF ----- @@ -79,15 +88,27 @@ is2re: s2ef: default: model: + num_interactions: 4 + hidden_channels: 750 + num_gaussians: 200 + num_filters: 256 regress_forces: "direct" force_coefficient: 30 energy_grad_coefficient: 10 optim: - warmup_steps: 5394 + batch_size: 48 + eval_batch_size: 48 + warmup_steps: 25000 + warmup_factor: 0.2 + lr_gamma: 0.1 + lr_initial: 0.0002 + max_epochs: 20 + warmup_steps: 20000 lr_milestones: - - 17981 - - 26972 - - 35963 + - 50000 + - 70000 + - 90000 + 200k: {} 2M: {} @@ -98,28 +119,45 @@ s2ef: qm9: default: + model: + hidden_channels: 150 + num_gaussians: 100 + num_filters: 128 + num_interactions: 6 + cutoff: 5.0 optim: + batch_size: 1024 lr_initial: 0.001 - lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - - 16000 - - 25000 - - 35000 - warmup_steps: 5394 - max_epochs: 17 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + lr_gamma: 0.25 + lr_milestones: + - 17981 + - 26972 + - 35963 + - 52000 + - 100000 + warmup_steps: 1000 10k: {} all: {} qm7x: default: + model: + hidden_channels: 384 + num_interactions: 4 + optim: lr_initial: 0.001 - lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - - 16000 - - 25000 - - 35000 + lr_milestones: + - 17981 + - 26972 + - 35963 warmup_steps: 5394 - max_epochs: 20 + max_epochs: 17 all: {} 1k: {} diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 17deea8eac..4a4cca9a5e 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -408,7 +408,7 @@ class FANet(BaseModel): of the edge embedding block. 
edge_embed_hidden (int): size of edge representation. could be num_filters or hidden_channels. - mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env}): + mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'}): specificies the MP of the interaction block. """ From 4c5f0556e1832bb50067a2803d62dfd364db96fe Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 6 Jan 2023 08:25:57 -0500 Subject: [PATCH 020/273] fix no gradient issue FANet 2 layers --- configs/exps/gnn/test-gnn-1.yaml | 3 ++- ocpmodels/models/fanet.py | 15 +++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/configs/exps/gnn/test-gnn-1.yaml b/configs/exps/gnn/test-gnn-1.yaml index 223f94d929..5c3b23f4ab 100644 --- a/configs/exps/gnn/test-gnn-1.yaml +++ b/configs/exps/gnn/test-gnn-1.yaml @@ -15,9 +15,10 @@ default: pg_hidden_channels: 0 # shall have been 32 energy_head: 'weighted-av-initial-embeds' # False ? optim: - lr_initial: 0.002 + lr_initial: 0.0035 lr_gamma: 0.07 warmup_steps: 500 + max_epochs: 25 wandb_tags: 'test-fanet' runs: diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 4a4cca9a5e..37a58f61f2 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -114,23 +114,22 @@ def __init__( # TODO: change some num_filters to edge_embed_hidden if self.edge_embed_type == "rij": self.lin_e1 = Linear(3, num_filters) - self.lin_e2 = Linear(num_filters, num_filters) elif self.edge_embed_type == "all_rij": self.lin_e1 = Linear(3, num_filters // 3) # r_ij self.lin_e12 = Linear(3, num_filters // 3) # norm r_ij self.lin_e13 = Linear( num_gaussians, num_filters - 2 * (num_filters // 3) ) # d_ij - self.lin_e2 = Linear(num_filters, num_filters) # mlp of concat elif self.edge_embed_type == "sh": self.lin_e1 = Linear(15, num_filters) - self.lin_e2 = Linear(num_filters, num_filters) elif self.edge_embed_type == "all": self.lin_e1 = Linear(18, num_filters) - self.lin_e2 = Linear(num_filters, num_filters) else: raise ValueError("edge_embedding_type does not exist") + if self.second_layer_MLP: + self.lin_e2 = Linear(num_filters, num_filters) + self.reset_parameters() def reset_parameters(self): @@ -144,13 +143,13 @@ def reset_parameters(self): self.group_embedding.reset_parameters() nn.init.xavier_uniform_(self.lin.weight) self.lin.bias.data.fill_(0) + nn.init.xavier_uniform_(self.lin_e1.weight) + self.lin_e1.bias.data.fill_(0) if self.second_layer_MLP: nn.init.xavier_uniform_(self.lin_2.weight) self.lin_2.bias.data.fill_(0) - nn.init.xavier_uniform_(self.lin_e1.weight) - self.lin_e1.bias.data.fill_(0) - nn.init.xavier_uniform_(self.lin_e2.weight) - self.lin_e2.bias.data.fill_(0) + nn.init.xavier_uniform_(self.lin_e2.weight) + self.lin_e2.bias.data.fill_(0) if self.edge_embed_type == "all_rij": nn.init.xavier_uniform_(self.lin_e12.weight) self.lin_e12.bias.data.fill_(0) From 7a4f77323eb86a869f953eb24d3bba054f911c09 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 6 Jan 2023 11:51:12 -0500 Subject: [PATCH 021/273] add LinearWarmupCosineAnnealingLR --- ocpmodels/modules/scheduler.py | 38 ++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 223ec24447..cf8cae1b64 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -3,6 +3,7 @@ import torch.optim.lr_scheduler as lr_scheduler from ocpmodels.common.utils import warmup_lr_lambda +import pytorch_warmup as warmup class 
LRScheduler: @@ -20,20 +21,31 @@ class LRScheduler: optimizer (obj): torch optim object """ - def __init__(self, optimizer, config): + def __init__(self, optimizer, optim_config): self.optimizer = optimizer - self.config = config.copy() - if "scheduler" in self.config: - self.scheduler_type = self.config["scheduler"] + self.optim_config = optim_config.copy() + self.warmup_scheduler = None + if "scheduler" in self.optim_config: + self.scheduler_type = self.optim_config["scheduler"] else: self.scheduler_type = "LambdaLR" - scheduler_lambda_fn = lambda x: warmup_lr_lambda(x, self.config) - self.config["lr_lambda"] = scheduler_lambda_fn + scheduler_lambda_fn = lambda x: warmup_lr_lambda(x, self.optim_config) + self.optim_config["lr_lambda"] = scheduler_lambda_fn - if self.scheduler_type != "Null": + if ( + self.scheduler_type != "Null" + and self.scheduler_type != "LinearWarmupCosineAnnealingLR" + ): self.scheduler = getattr(lr_scheduler, self.scheduler_type) - scheduler_args = self.filter_kwargs(config) + scheduler_args = self.filter_kwargs(optim_config) self.scheduler = self.scheduler(optimizer, **scheduler_args) + elif self.scheduler_type == "WarmupCosineAnnealingLR": + self.warmup_scheduler = warmup.ExponentialWarmup( + self.optimizer, warmup_period=optim_config["warmup_steps"] + ) + self.scheduler = lr_scheduler.CosineAnnealingLR( + self.optimizer, T_max=optim_config["max_steps"], eta_min=1e-7 + ) def step(self, metrics=None, epoch=None): if self.scheduler_type == "Null": @@ -43,9 +55,13 @@ def step(self, metrics=None, epoch=None): raise Exception("Validation set required for ReduceLROnPlateau.") self.scheduler.step(metrics) else: - self.scheduler.step() + if self.warmup_scheduler: + with self.warmup_scheduler.dampening(): + self.scheduler.step(epoch) + else: + self.scheduler.step() - def filter_kwargs(self, config): + def filter_kwargs(self, optim_config): # adapted from https://stackoverflow.com/questions/26515595/ sig = inspect.signature(self.scheduler) filter_keys = [ @@ -55,7 +71,7 @@ def filter_kwargs(self, config): ] filter_keys.remove("optimizer") scheduler_args = { - arg: self.config[arg] for arg in self.config if arg in filter_keys + arg: optim_config[arg] for arg in optim_config if arg in filter_keys } return scheduler_args From 261cc5b3ee6914b5138cce903026d99d40aca4cf Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 10:37:59 -0500 Subject: [PATCH 022/273] update configs --- configs/exps/qm7x/fanet.yaml | 24 ++++++++++++------------ configs/exps/qm7x/schnet.yaml | 15 ++++----------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/configs/exps/qm7x/fanet.yaml b/configs/exps/qm7x/fanet.yaml index 627caca163..801c506565 100644 --- a/configs/exps/qm7x/fanet.yaml +++ b/configs/exps/qm7x/fanet.yaml @@ -12,23 +12,23 @@ default: wandb_project: ocp-qm mode: train test_ri: true - wandb_tags: qm7x, std/10 + wandb_tags: qm7x #, std/10 frame_averaging: "" cp_data_to_tmpdir: true note: task: name model: name, num_gaussians, hidden_channels, num_filters, num_interactions, mp_type, edge_embed_type optim: batch_size, lr_initial - dataset: - train: - std_divider: 10.0 - val_id: - std_divider: 10.0 - val_ood: - std_divider: 10.0 + # dataset: + # train: + # std_divider: 10.0 + # val_id: + # std_divider: 10.0 + # val_ood: + # std_divider: 10.0 optim: - batch_size: 1024 - warmup_steps: 3000 + batch_size: 2048 + warmup_steps: 1000 lr_initial: 0.0005 # parameters EMA ema_decay: 0.999 @@ -45,8 +45,8 @@ default: # archi hidden_channels: 256 num_filters: 256 - 
num_gaussians: 32 - num_interactions: 4 + num_gaussians: 100 + num_interactions: 6 cutoff: 6.0 regress_forces: False # fanet diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index 65d4d6654f..73e5ace742 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -12,7 +12,7 @@ default: wandb_project: ocp-qm mode: train test_ri: true - wandb_tags: qm7x, std/10 + wandb_tags: qm7x phys_hidden_channels: 0 phys_embeds: False energy_head: False @@ -24,22 +24,15 @@ default: task: name model: name, num_gaussians, hidden_channels, num_filters, num_interactions optim: batch_size, lr_initial - dataset: - train: - std_divider: 10.0 - val_id: - std_divider: 10.0 - val_ood: - std_divider: 10.0 optim: - batch_size: 1024 - warmup_steps: 3000 + batch_size: 2048 + warmup_steps: 1000 lr_initial: 0.0005 # parameters EMA ema_decay: 0.999 decay_steps: 750000 decay_rate: 0.05 - max_steps: 1000000 + max_steps: 200000 model: hidden_channels: 256 num_filters: 256 From 9206f33b233b54a1027071ede44ddb96e9a77b60 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 9 Jan 2023 09:32:34 -0500 Subject: [PATCH 023/273] Force MAE + many config files --- configs/exps/gnn/test-gnn-all-1.yaml | 82 ++++++++++++++++++- configs/exps/icml/baseline_s2ef.yaml | 17 ++-- configs/exps/icml/test_params.yaml | 30 +++++++ .../exps/prop-check/symmetries_s2ef_2.yaml | 77 +++++++++++++++++ configs/models/dpp.yaml | 34 +++++--- configs/models/schnet.yaml | 8 +- ocpmodels/trainers/base_trainer.py | 24 ++++-- ocpmodels/trainers/single_trainer.py | 23 +++++- 8 files changed, 261 insertions(+), 34 deletions(-) create mode 100644 configs/exps/icml/test_params.yaml create mode 100644 configs/exps/prop-check/symmetries_s2ef_2.yaml diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml index 65fd1a50c8..ea5dd8ec56 100644 --- a/configs/exps/gnn/test-gnn-all-1.yaml +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -4,6 +4,7 @@ job: gres: gpu:rtx8000:4 partition: long time: 20:00:00 + code_loc: /home/mila/a/alexandre.duval/ocp/ocp-test/ocp default: test_ri: True @@ -15,6 +16,8 @@ default: pg_hidden_channels: 0 # shall have been 32 energy_head: 'weighted-av-initial-embeds' # False ? 
wandb_tags: 'test-fanet' + optim: + lr_initial: 0.0008 runs: - config: fanet-is2re-all @@ -22,10 +25,83 @@ runs: model: mp_type: simple edge_embed_type: rij - optim: - lr_initial: 0.0007 + frame_averaging: 2D + fa_fames: random + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: rij + frame_averaging: 2D + fa_fames: random + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all_rij + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: sh - config: fanet-is2re-all note: 'Simple rij baseline' model: - mp_type: simple + mp_type: updownscale + edge_embed_type: rij + - config: fanet-is2re-all + note: 'Simple rij baseline' + model: + mp_type: updownscale edge_embed_type: rij + - config: fanet-is2re-all + note: 'Simple rij baseline' + model: + mp_type: updownscale + edge_embed_type: all + optim: + lr_initial: 0.0007 + max_epochs: 25 + + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all + hidden_channels: 500 + num_interactions: 6 + num_filters: 200 + num_gaussians: 200 + frame_averaging: 2D + fa_fames: random + optim: + lr_initial: 0.0005 + max_epochs: 25 + + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all + hidden_channels: 500 + num_interactions: 6 + num_filters: 200 + num_gaussians: 200 + frame_averaging: 2D + fa_fames: random + optim: + lr_initial: 0.0002 + max_epochs: 25 + + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all + hidden_channels: 500 + num_interactions: 6 + num_filters: 200 + num_gaussians: 200 + frame_averaging: 2D + fa_fames: random + optim: + lr_initial: 0.0007 + max_epochs: 25 diff --git a/configs/exps/icml/baseline_s2ef.yaml b/configs/exps/icml/baseline_s2ef.yaml index fa66d3a6d5..71312e50b3 100644 --- a/configs/exps/icml/baseline_s2ef.yaml +++ b/configs/exps/icml/baseline_s2ef.yaml @@ -1,18 +1,25 @@ job: - mem: 32GB + mem: 48GB cpus: 4 - gres: gpu:rtx8000:4 + gres: gpu:rtx8000:1 partition: long - time: 36:00:00 + time: 42:00:00 default: test_ri: True mode: train - wandb_tags: 'Baseline 2 gpus' + wandb_tags: 'Baseline dpp 1 Gpus' runs: + - config: dpp-s2ef-2M + note: 'Baseline Schnet S2EF' + optim: + batch_size: 368 + eval_batch_size: 368 - config: schnet-s2ef-2M note: 'Baseline Schnet S2EF' optim: max_epochs: 15 - force_coefficient: 50 \ No newline at end of file + force_coefficient: 50 + batch_size: 192 + eval_batch_size: 192 diff --git a/configs/exps/icml/test_params.yaml b/configs/exps/icml/test_params.yaml new file mode 100644 index 0000000000..85a48351e5 --- /dev/null +++ b/configs/exps/icml/test_params.yaml @@ -0,0 +1,30 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: False # False ? 
+ optim: + max_epochs: 10 + wandb_tags: 'prop-check-ICLM' + +runs: + - config: sfarinet-s2ef-2M + note: 'All No TMP 1 GPU with grad target' + model: + regress_forces: direct + optim: + batch_size: 192 + eval_batch_size: 192 + frame_averaging: 2D + fa_frames: all diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml new file mode 100644 index 0000000000..9abfc02b40 --- /dev/null +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -0,0 +1,77 @@ +job: + mem: 48GB + cpus: 4 + gres: gpu:rtx8000:4 + partition: long + time: 40:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: False # False ? + optim: + max_epochs: 5 + wandb_tags: 'prop-check-ICLM' + +runs: + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: from_energy + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs' + frame_averaging: 3D + fa_frames: all + model: + regress_forces: from_energy + + - config: sfarinet-s2ef-2M + note: '2D all gradient' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + - config: sfarinet-s2ef-2M + note: '2d all no gradient' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct + + - config: sfarinet-s2ef-2M + note: 'Big energy grad coef' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 1 + - config: sfarinet-s2ef-2M + note: 'Big energy grad coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 1 + - config: sfarinet-s2ef-2M + note: 'No energy coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 0 diff --git a/configs/models/dpp.yaml b/configs/models/dpp.yaml index 75b50b8f9d..4b973595f4 100644 --- a/configs/models/dpp.yaml +++ b/configs/models/dpp.yaml @@ -87,31 +87,30 @@ s2ef: optim: num_workers: 8 eval_every: 10000 + 200k: optim: # *** Important note *** - # The total number of gpus used for this run was 16. + # The total number of gpus used for this run was 4. # If the global batch size (num_gpus * batch_size) is modified # the lr_milestones and warmup_steps need to be adjusted accordingly. - batch_size: 12 - eval_batch_size: 12 + batch_size: 48 + eval_batch_size: 48 lr_initial: 0.00001 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - 5208 - 8333 - 10416 warmup_steps: 3125 - max_epochs: 30 + max_epochs: 10 force_coefficient: 50 2M: optim: - # *** Important note *** - # The total number of gpus used for this run was 32. - # If the global batch size (num_gpus * batch_size) is modified - # the lr_milestones and warmup_steps need to be adjusted accordingly. 
- batch_size: 12 - eval_batch_size: 12 + batch_size: 96 + eval_batch_size: 96 + eval_every: 10000 + num_workers: 8 lr_initial: 0.0001 lr_gamma: 0.1 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma @@ -119,8 +118,21 @@ s2ef: - 31250 - 41666 warmup_steps: 10416 - max_epochs: 15 + warmup_factor: 0.2 + max_epochs: 5 force_coefficient: 50 + model: + hidden_channels: 192 + out_emb_channels: 192 + num_blocks: 3 + cutoff: 6.0 + num_radial: 6 + num_spherical: 7 + num_before_skip: 1 + num_after_skip: 2 + num_output_layers: 3 + regress_forces: True + use_pbc: True 20M: optim: diff --git a/configs/models/schnet.yaml b/configs/models/schnet.yaml index 217d052cae..48b7fcc544 100644 --- a/configs/models/schnet.yaml +++ b/configs/models/schnet.yaml @@ -82,11 +82,11 @@ s2ef: num_gaussians: 200 optim: # *** Important note *** - # The total number of gpus used for this run was 8. + # The total number of gpus used for this run was 4. # If the global batch size (num_gpus * batch_size) is modified # the lr_milestones and warmup_steps need to be adjusted accordingly. - batch_size: 24 - eval_batch_size: 24 + batch_size: 48 + eval_batch_size: 48 num_workers: 16 lr_initial: 0.0001 lr_gamma: 0.1 @@ -95,7 +95,7 @@ s2ef: - 83333 - 104166 warmup_steps: 31250 - max_epochs: 30 + max_epochs: 20 force_coefficient: 100 200k: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 28b24a80df..f2f69b9bb2 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -20,12 +20,12 @@ import torch.nn as nn import torch.optim as optim import yaml +from rich.console import Console +from rich.table import Table from torch.nn.parallel.distributed import DistributedDataParallel from torch.utils.data import DataLoader from torch_geometric.data import Batch from tqdm import tqdm -from rich.table import Table -from rich.console import Console from ocpmodels.common import distutils from ocpmodels.common.data_parallel import ( @@ -35,7 +35,8 @@ ) from ocpmodels.common.graph_transforms import RandomReflect, RandomRotate from ocpmodels.common.registry import registry -from ocpmodels.common.utils import get_commit_hash, save_checkpoint, JOB_ID +from ocpmodels.common.timer import Times +from ocpmodels.common.utils import JOB_ID, get_commit_hash, save_checkpoint from ocpmodels.datasets.data_transforms import FrameAveraging, get_transforms from ocpmodels.modules.evaluator import Evaluator from ocpmodels.modules.exponential_moving_average import ( @@ -44,7 +45,6 @@ from ocpmodels.modules.loss import DDPLoss, L2MAELoss from ocpmodels.modules.normalizer import Normalizer from ocpmodels.modules.scheduler import LRScheduler -from ocpmodels.common.timer import Times @registry.register_trainer("base") @@ -725,7 +725,8 @@ def eval_all_splits( """Evaluate model on all four validation splits""" cumulated_time = 0 - cumulated_mae = 0 + cumulated_energy_mae = 0 + cumulated_forces_mae = 0 metrics_dict = {} # store all non-train splits: all vals and test all_splits = [s for s in self.config["dataset"] if s.startswith("val")] @@ -759,7 +760,9 @@ def eval_all_splits( return "SIGTERM" metrics_dict[split] = self.metrics - cumulated_mae += self.metrics["energy_mae"]["metric"] + cumulated_energy_mae += self.metrics["energy_mae"]["metric"] + if self.config["model"].get("regress_forces", False): + cumulated_forces_mae += self.metrics["forces_mae"]["metric"] cumulated_time += time.time() - start_time if metrics_names is None: metrics_names = list(self.metrics.keys()) @@ -777,12 
+780,15 @@ def eval_all_splits( # Log specific metrics if final and self.config["logger"] == "wandb" and distutils.is_master(): - overall_mae = cumulated_mae / len(all_splits) + overall_energy_mae = cumulated_energy_mae / len(all_splits) self.logger.log({"Eval time": cumulated_time}) - self.logger.log({"Overall MAE": overall_mae}) + self.logger.log({"Overall MAE": overall_energy_mae}) + if self.config["model"].get("regress_forces", False): + overall_forces_mae = cumulated_forces_mae / len(all_splits) + self.logger.log({"Overall Forces MAE": overall_forces_mae}) if self.logger.ntfy: self.logger.ntfy( - message=f"{JOB_ID} - Overall MAE: {overall_mae}", + message=f"{JOB_ID} - Overall MAE: {overall_energy_mae}", click=self.logger.url, ) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 388afa127c..3e8529f67e 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -202,6 +202,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): start_epoch = self.step // n_train loader_times = Times() epoch_times = [] + model_run_time = 0 if not self.silent: print("---Beginning of Training---") @@ -231,6 +232,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): batch = next(train_loader_iter) # Forward, loss, backward. + if epoch_int == 1: + s = time.time() + with torch.cuda.amp.autocast(enabled=self.scaler is not None): preds = self.model_forward(batch) loss = self.compute_loss(preds, batch) @@ -238,6 +242,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): coeff = self.config["optim"].get("pooling_coefficient", 1) loss["total_loss"] += preds["pooling_loss"] * coeff + if epoch_int == 1: + model_run_time += time.time() - s + loss = { k: self.scaler.scale(v) if self.scaler else v for k, v in loss.items() @@ -350,6 +357,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): log_epoch_times = True self.model_forward(batch) self.logger.log({"Batch time": time.time() - start_time}) + self.logger.log({"Model run time": model_run_time / len(self.train_loader)}) if log_epoch_times: self.logger.log({"Epoch time": sum(epoch_times) / len(epoch_times)}) @@ -666,8 +674,19 @@ def test_model_symmetries(self, debug_batches=-1): reflected = self.reflect_graph(batch) preds3 = self.model_forward(reflected["batch_list"]) energy_diff_refl += torch.abs(preds1["energy"] - preds3["energy"]).sum() - if self.task_name == "s2ef": - forces_diff_refl += torch.abs(preds1["forces"] - preds3["forces"]).sum() + if self.task_name == "s2ef": + forces_diff_refl += torch.abs( + preds1["forces"] @ reflected["rot"].to(preds1["forces"].device) + - preds3["forces"] + ).sum() + # assert torch.allclose( + # torch.abs( + # batch[0].force @ reflected["rot"].to(batch[0].force.device) + # - reflected["batch_list"][0].force # .to(batch[0].force.device) + # ).sum(), + # torch.tensor([0.0]), # .to(batch[0].force.device) + # atol=1e-05, + # ) # 3D Rotation and compute diff in prediction rotated = self.rotate_graph(batch) From 8d4947d31c15c2ac094ca416cf794b304709271b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:23:25 -0500 Subject: [PATCH 024/273] various bug fixes from `orion` branch --- launch_exp.py | 4 +-- ocpmodels/common/utils.py | 51 +++++++++++++++--------------- ocpmodels/datasets/lmdb_dataset.py | 11 ++++--- ocpmodels/modules/scheduler.py | 13 +++++--- 4 files changed, 44 insertions(+), 35 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 16f5932b2b..c2a16c76f5 100644 --- a/launch_exp.py 
+++ b/launch_exp.py @@ -29,7 +29,7 @@ def merge_dicts(dict1: dict, dict2: dict): Returns ------- - return_dict_and_duplicates: tuple(dict, list(str)) + return_dict: dict Merged dictionaries. """ if not isinstance(dict1, dict): @@ -51,7 +51,7 @@ def merge_dicts(dict1: dict, dict2: dict): f"List for key {k} has different length in dict1 and dict2." + " Use an empty dict {} to pad for items in the shorter list." ) - return_dict[k] = [merge_dicts(d1, d2)[0] for d1, d2 in zip(dict1[k], v)] + return_dict[k] = [merge_dicts(d1, d2) for d1, d2 in zip(dict1[k], v)] else: return_dict[k] = dict2[k] diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 1938cf98b8..fe2a524c46 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1,4 +1,4 @@ -""" +"""utils.py Copyright (c) Facebook, Inc. and its affiliates. This source code is licensed under the MIT license found in the @@ -50,26 +50,33 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): ): return trainer_config + print("\nMoving data to slurm tmpdir", flush=True) + tmp_dir = Path(f"/Tmp/slurm.{JOB_ID}.0") for s, split in trainer_config["dataset"].items(): if not isinstance(split, dict): continue - new_dir = tmp_dir / Path(split["src"]).name + original = Path(split["src"]) + if original.is_file(): + original = original.parent + new_dir = tmp_dir / original.name if new_dir.exists(): print( f"Data already copied to {str(new_dir)} for split", f"{s} with source path {split['src']}", + flush=True, ) trainer_config["dataset"][s]["src"] = str(new_dir) continue + print("Making new_dir: ", str(new_dir), flush=True) new_dir.mkdir() - command = ["rsync", "-av", f'{split["src"]}/', str(new_dir)] - print("Copying data: ", " ".join(command)) + command = ["cp", "-r", f"{str(original)}", str(new_dir.parent)] + print("Copying data: ", " ".join(command), flush=True) subprocess.run(command) for f in new_dir.glob("*.lmdb-lock"): f.unlink() trainer_config["dataset"][s]["src"] = str(new_dir) - print("Done moving data to", str(new_dir)) + print("Done moving data to", str(new_dir), flush=True) return trainer_config @@ -98,7 +105,7 @@ def override_narval_paths(trainer_config): "with", path_overrides[task][split], ) - trainer_config["dataset"], _ = merge_dicts( + trainer_config["dataset"] = merge_dicts( trainer_config["dataset"], path_overrides[task][split] ) @@ -702,11 +709,11 @@ def load_config(config_str): assert "default" in task_conf assert split in task_conf - config, _ = merge_dicts({}, model_conf["default"]) - config, _ = merge_dicts(config, model_conf[task].get("default", {})) - config, _ = merge_dicts(config, model_conf[task][split]) - config, _ = merge_dicts(config, task_conf["default"]) - config, _ = merge_dicts(config, task_conf[split]) + config = merge_dicts({}, model_conf["default"]) + config = merge_dicts(config, model_conf[task].get("default", {})) + config = merge_dicts(config, model_conf[task][split]) + config = merge_dicts(config, task_conf["default"]) + config = merge_dicts(config, task_conf[split]) config["task"]["name"] = task config["task"]["split"] = split @@ -725,11 +732,9 @@ def build_config(args, args_override): # Check for overridden parameters. 
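For orientation, the chain of `merge_dicts` calls in `load_config` above, followed by the override handling here, is a plain "later layers win" composition. A toy illustration with made-up dictionaries, not the real YAML contents:

from functools import reduce

def merge(d1: dict, d2: dict) -> dict:
    # Same idea as merge_dicts, minus list handling: nested dicts merge, later values win.
    out = dict(d1)
    for k, v in d2.items():
        if isinstance(v, dict) and isinstance(out.get(k), dict):
            out[k] = merge(out[k], v)
        else:
            out[k] = v
    return out

layers = [
    {"optim": {"lr_initial": 0.001, "max_epochs": 20}},  # model default
    {"optim": {"lr_initial": 0.0005}},                   # model default for this task
    {"optim": {"batch_size": 256}},                      # model config for this split
    {"optim": {"max_epochs": 10}},                       # command-line override
]
print(reduce(merge, layers, {}))
# {'optim': {'lr_initial': 0.0005, 'max_epochs': 10, 'batch_size': 256}}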
if args_override != []: overrides = create_dict_from_args(args_override) - config, _ = merge_dicts(config, overrides) + config = merge_dicts(config, overrides) - config, _ = merge_dicts( - config, {k: v for k, v in vars(args).items() if v is not None} - ) + config = merge_dicts(config, {k: v for k, v in vars(args).items() if v is not None}) config["data_split"] = args.config.split("-")[-1] config["run_dir"] = resolve(config["run_dir"]) config["slurm"] = {} @@ -760,7 +765,6 @@ def build_config(args, args_override): config = set_qm7x_target_stats(config) config = override_narval_paths(config) config = auto_note(config) - config = move_lmdb_data_to_slurm_tmpdir(config) if not config["no_cpus_to_workers"]: cpus = count_cpus() @@ -1105,7 +1109,7 @@ def get_pruned_edge_idx(edge_index, num_atoms=None, max_neigh=1e9): return _nonmax_idx -def merge_dicts(dict1: dict, dict2: dict): +def merge_dicts(dict1: dict, dict2: dict) -> dict: """Recursively merge two dictionaries. Values in dict2 override values in dict1. If dict1 and dict2 contain a dictionary as a value, this will call itself recursively to merge these dictionaries. @@ -1123,7 +1127,7 @@ def merge_dicts(dict1: dict, dict2: dict): Returns ------- - return_dict_and_duplicates: tuple(dict, list(str)) + return_dict: dict Merged dictionaries. """ if not isinstance(dict1, dict): @@ -1132,27 +1136,24 @@ def merge_dicts(dict1: dict, dict2: dict): raise ValueError(f"Expecting dict2 to be dict, found {type(dict2)}.") return_dict = copy.deepcopy(dict1) - duplicates = [] for k, v in dict2.items(): if k not in dict1: return_dict[k] = v else: if isinstance(v, dict) and isinstance(dict1[k], dict): - return_dict[k], duplicates_k = merge_dicts(dict1[k], dict2[k]) - duplicates += [f"{k}.{dup}" for dup in duplicates_k] + return_dict[k] = merge_dicts(dict1[k], dict2[k]) elif isinstance(v, list) and isinstance(dict1[k], list): if len(dict1[k]) != len(dict2[k]): raise ValueError( f"List for key {k} has different length in dict1 and dict2." + " Use an empty dict {} to pad for items in the shorter list." ) - return_dict[k] = [merge_dicts(d1, d2)[0] for d1, d2 in zip(dict1[k], v)] + return_dict[k] = [merge_dicts(d1, d2) for d1, d2 in zip(dict1[k], v)] else: return_dict[k] = dict2[k] - duplicates.append(k) - return return_dict, duplicates + return return_dict class SeverityLevelBetween(logging.Filter): @@ -1315,4 +1316,4 @@ def base_config(config, overrides={}): ], ) - return merge_dicts(conf, overrides)[0] + return merge_dicts(conf, overrides) diff --git a/ocpmodels/datasets/lmdb_dataset.py b/ocpmodels/datasets/lmdb_dataset.py index 0540741c70..2eaef01200 100644 --- a/ocpmodels/datasets/lmdb_dataset.py +++ b/ocpmodels/datasets/lmdb_dataset.py @@ -1,4 +1,4 @@ -""" +"""lmdb_dataset.py Copyright (c) Facebook, Inc. and its affiliates. 
This source code is licensed under the MIT license found in the @@ -52,9 +52,12 @@ def __init__(self, config, transform=None, fa_frames=None): self._keys, self.envs = [], [] for db_path in db_paths: self.envs.append(self.connect_db(db_path)) - length = pickle.loads( - self.envs[-1].begin().get("length".encode("ascii")) - ) + length = self.envs[-1].begin().get("length".encode("ascii")) + if length is not None: + length = pickle.loads(length) + else: + length = self.envs[-1].stat()["entries"] + assert length is not None, f"Could not find length of LMDB {db_path}" self._keys.append(list(range(length))) keylens = [len(k) for k in self._keys] diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index cf8cae1b64..dbd4106142 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -1,3 +1,5 @@ +"""scheduler.py +""" import inspect import torch.optim.lr_scheduler as lr_scheduler @@ -29,7 +31,10 @@ def __init__(self, optimizer, optim_config): self.scheduler_type = self.optim_config["scheduler"] else: self.scheduler_type = "LambdaLR" - scheduler_lambda_fn = lambda x: warmup_lr_lambda(x, self.optim_config) + + def scheduler_lambda_fn(x): + return warmup_lr_lambda(x, self.optim_config) + self.optim_config["lr_lambda"] = scheduler_lambda_fn if ( @@ -37,14 +42,14 @@ def __init__(self, optimizer, optim_config): and self.scheduler_type != "LinearWarmupCosineAnnealingLR" ): self.scheduler = getattr(lr_scheduler, self.scheduler_type) - scheduler_args = self.filter_kwargs(optim_config) + scheduler_args = self.filter_kwargs(self.optim_config) self.scheduler = self.scheduler(optimizer, **scheduler_args) elif self.scheduler_type == "WarmupCosineAnnealingLR": self.warmup_scheduler = warmup.ExponentialWarmup( - self.optimizer, warmup_period=optim_config["warmup_steps"] + self.optimizer, warmup_period=self.optim_config["warmup_steps"] ) self.scheduler = lr_scheduler.CosineAnnealingLR( - self.optimizer, T_max=optim_config["max_steps"], eta_min=1e-7 + self.optimizer, T_max=self.optim_config["max_steps"], eta_min=1e-7 ) def step(self, metrics=None, epoch=None): From da4310adb5c7ad8701d7be2fc0905423016b6bf6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 15:05:27 -0500 Subject: [PATCH 025/273] print wandb query --- launch_exp.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index c2a16c76f5..9196bfb883 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -11,6 +11,15 @@ import copy +def util_strings(jobs, yaml_comments=False): + s = "All jobs launched: " + ", ".join(jobs) + s += "\nCancel experiment: scancel " + " ".join(jobs) + s += "\nWandB query for dashboard: (" + "|".join(jobs) + ")" + if yaml_comments: + s = "\n".join(["# " + line for line in s.splitlines()]) + return s + + def merge_dicts(dict1: dict, dict2: dict): """Recursively merge two dictionaries. Values in dict2 override values in dict1. 
If dict1 and dict2 contain a dictionary @@ -75,6 +84,7 @@ def write_exp_yaml_and_jobs(exp_file, outfile, jobs): if line.strip().startswith("- "): lines[run_line + i] = f"{line} # {jobs[j]}" j += 1 + lines += [""] + util_strings(jobs, True).splitlines() yml_out = outfile.with_suffix(".yaml") yml_out.write_text("\n".join(lines)) return yml_out @@ -194,7 +204,7 @@ def cli_arg(args, key=""): with outfile.open("w") as f: f.write(text) print(f"Output written to {str(outfile)}") - print("All job launched:", " ".join(jobs)) + print(util_strings(jobs)) yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) print( "Experiment summary YAML in ", f"./{str(yml_out.relative_to(Path.cwd()))}" From c44ccf8a1a6c070eec824e39c30d780e15c2cb90 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 9 Jan 2023 18:22:11 -0500 Subject: [PATCH 026/273] update config files --- configs/exps/gnn/is2re_1gpu.yaml | 118 ++++++++++++++++++ configs/exps/gnn/test-gnn-all-1.yaml | 6 +- configs/exps/icml/baseline_s2ef.yaml | 22 ++-- .../exps/prop-check/symmetries_s2ef_2.yaml | 21 +++- configs/models/dpp.yaml | 2 +- configs/models/fanet.yaml | 23 +++- configs/models/sfarinet.yaml | 5 +- ocpmodels/models/sfarinet.py | 1 + ocpmodels/trainers/single_trainer.py | 4 +- scripts/gnn_dev.py | 6 +- 10 files changed, 177 insertions(+), 31 deletions(-) create mode 100644 configs/exps/gnn/is2re_1gpu.yaml diff --git a/configs/exps/gnn/is2re_1gpu.yaml b/configs/exps/gnn/is2re_1gpu.yaml new file mode 100644 index 0000000000..5aa2ecb141 --- /dev/null +++ b/configs/exps/gnn/is2re_1gpu.yaml @@ -0,0 +1,118 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 30:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? 
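As a concrete example of the summary produced by the `util_strings` helper added above (hypothetical job ids, and assuming the call runs from the repository root so `launch_exp` is importable):

from launch_exp import util_strings

print(util_strings(["1234567", "1234568"]))
# All jobs launched: 1234567, 1234568
# Cancel experiment: scancel 1234567 1234568
# WandB query for dashboard: (1234567|1234568)

# With yaml_comments=True each line is prefixed with "# " so it can be appended
# to the experiment YAML copy written by write_exp_yaml_and_jobs.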
+ wandb_tags: 'is2re-archi-tests' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + +runs: + - config: schnet-is2re-all + note: 'Schnet' + - config: sfarinet-is2re-all + note: 'Sfarinet test' + frame_averaging: 2D + fa_fames: se3-random + - config: sfarinet-is2re-all + note: 'Smaller lr' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.0005 + - config: sfarinet-is2re-all + note: 'Sfarinet test smaller lr' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.003 + - config: sfarinet-is2re-all + note: 'Bigger size' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.001 + model: + hidden_channels: 500 + num_interactions: 4 + num_filters: 200 + num_gaussians: 200 + - config: sfarinet-is2re-all + note: 'Bigger size' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.0007 + model: + hidden_channels: 500 + num_interactions: 4 + num_filters: 200 + num_gaussians: 200 + - config: sfarinet-is2re-all + note: 'Bigger size' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.001 + model: + num_interactions: 6 + - config: sfarinet-is2re-all + note: 'Bigger size and smaller lr' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.0007 + model: + num_interactions: 6 + - config: sfarinet-is2re-all + note: 'Bigger size and change warmup steps' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.001 + lr_milestones: + - 20981 + - 26972 + - 35963 + warmup_steps: 10094 + model: + hidden_channels: 500 + num_interactions: 4 + num_filters: 200 + num_gaussians: 200 + - config: sfarinet-is2re-all + note: 'Much Bigger size' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.0007 + model: + hidden_channels: 800 + num_interactions: 4 + num_filters: 284 + num_gaussians: 284 + - config: sfarinet-is2re-all + note: 'Smaller size more interactions' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.001 + model: + hidden_channels: 128 + num_interactions: 6 + num_filters: 100 + num_gaussians: 100 + diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml index ea5dd8ec56..00a54ada07 100644 --- a/configs/exps/gnn/test-gnn-all-1.yaml +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -17,7 +17,7 @@ default: energy_head: 'weighted-av-initial-embeds' # False ? 
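As a reminder of what the `lr_initial` / `warmup_steps` / `lr_milestones` / `lr_gamma` combinations in the sweeps above mean: the learning rate warms up linearly over `warmup_steps`, then is multiplied by `lr_gamma` every time a milestone (an optimizer-step count, not an epoch) is passed. A self-contained sketch of that multiplier, mirroring the usual `warmup_lr_lambda` behaviour; the exact implementation lives in `ocpmodels/common/utils.py`:

from bisect import bisect

def lr_multiplier(step, warmup_steps, warmup_factor, milestones, gamma):
    # Linear warmup from warmup_factor to 1, then step decay by gamma at each milestone.
    if step <= warmup_steps:
        alpha = step / float(warmup_steps)
        return warmup_factor * (1.0 - alpha) + alpha
    return gamma ** bisect(milestones, step)

# e.g. lr_initial=0.001, warmup_steps=10094, lr_milestones=[20981, 26972, 35963], lr_gamma=0.1
# (warmup_factor assumed to be 0.2 for this example)
for step in (5000, 15000, 25000, 40000):
    print(step, 0.001 * lr_multiplier(step, 10094, 0.2, [20981, 26972, 35963], 0.1))
# ~0.0006 at 5000, 0.001 at 15000, 0.0001 at 25000, 1e-06 at 40000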
wandb_tags: 'test-fanet' optim: - lr_initial: 0.0008 + lr_initial: 0.0005 runs: - config: fanet-is2re-all @@ -61,7 +61,7 @@ runs: mp_type: updownscale edge_embed_type: all optim: - lr_initial: 0.0007 + lr_initial: 0.0004 max_epochs: 25 - config: fanet-is2re-all @@ -75,7 +75,7 @@ runs: frame_averaging: 2D fa_fames: random optim: - lr_initial: 0.0005 + lr_initial: 0.0004 max_epochs: 25 - config: fanet-is2re-all diff --git a/configs/exps/icml/baseline_s2ef.yaml b/configs/exps/icml/baseline_s2ef.yaml index 71312e50b3..cefff0fb5e 100644 --- a/configs/exps/icml/baseline_s2ef.yaml +++ b/configs/exps/icml/baseline_s2ef.yaml @@ -1,25 +1,27 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:1 + gres: gpu:rtx8000:2 partition: long time: 42:00:00 default: test_ri: True mode: train - wandb_tags: 'Baseline dpp 1 Gpus' + wandb_tags: 'baseline-schnet' runs: - - config: dpp-s2ef-2M - note: 'Baseline Schnet S2EF' - optim: - batch_size: 368 - eval_batch_size: 368 - config: schnet-s2ef-2M - note: 'Baseline Schnet S2EF' + note: 'Baseline Schnet S2EF 2 GPU' + optim: + max_epochs: 15 + force_coefficient: 50 + batch_size: 96 + eval_batch_size: 96 + - config: schnet-is2re-2M + note: 'Baseline Schnet IS2RE 2 GPU' optim: max_epochs: 15 force_coefficient: 50 - batch_size: 192 - eval_batch_size: 192 + batch_size: 128 + eval_batch_size: 128 diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml index 9abfc02b40..aebe1b7934 100644 --- a/configs/exps/prop-check/symmetries_s2ef_2.yaml +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -1,9 +1,9 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:4 + gres: gpu:rtx8000:1 partition: long - time: 40:00:00 + time: 20:00:00 default: test_ri: True @@ -16,17 +16,19 @@ default: energy_head: False # False ? 
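These prop-check runs measure how far predictions are from the symmetries we want: energies should be invariant to rotations and reflections of the structure, and forces should transform with it. Schematically, the gap being tracked looks like the following sketch (stand-in `model` callable; the real measurement is `test_model_symmetries` in the trainer):

import torch

def symmetry_gaps(model, pos, rot):
    # Energy-invariance and force-equivariance gaps for one orthogonal transform `rot`.
    e1, f1 = model(pos)         # stand-in: returns (energy, forces) from positions
    e2, f2 = model(pos @ rot)   # same structure, rotated or reflected
    energy_gap = (e1 - e2).abs().sum()
    forces_gap = (f1 @ rot - f2).abs().sum()  # forces should rotate like positions
    return energy_gap, forces_gap

# Toy check with a model that is exactly invariant / equivariant by construction:
rot = torch.linalg.qr(torch.randn(3, 3)).Q    # random orthogonal matrix
model = lambda pos: ((pos ** 2).sum(), 2 * pos)
pos = torch.randn(8, 3)
print(symmetry_gaps(model, pos, rot))         # both gaps ~0 up to float error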
optim: max_epochs: 5 - wandb_tags: 'prop-check-ICLM' + batch_size: 196 + eval_batch_size: 196 + wandb_tags: 's2ef-sym-prop' runs: - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs' + note: 'Baseline 5 epochs 1 Gpu' frame_averaging: 2D fa_frames: all model: regress_forces: from_energy - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs' + note: 'Baseline 5 epochs 1 Gpu' frame_averaging: 3D fa_frames: all model: @@ -75,3 +77,12 @@ runs: energy_grad_coefficient: 100 force_coefficient: 30 energy_coefficient: 0 + - config: sfarinet-s2ef-2M + note: 'Large force coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + force_coefficient: 75 + energy_coefficient: 1 \ No newline at end of file diff --git a/configs/models/dpp.yaml b/configs/models/dpp.yaml index 4b973595f4..6a289bbc22 100644 --- a/configs/models/dpp.yaml +++ b/configs/models/dpp.yaml @@ -119,7 +119,7 @@ s2ef: - 41666 warmup_steps: 10416 warmup_factor: 0.2 - max_epochs: 5 + max_epochs: 15 force_coefficient: 50 model: hidden_channels: 192 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 9840d6432f..5bf8ad3546 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -85,6 +85,8 @@ is2re: # ----- S2EF ----- # ------------------ +# For 4 GPUs + s2ef: default: model: @@ -101,17 +103,26 @@ s2ef: warmup_steps: 25000 warmup_factor: 0.2 lr_gamma: 0.1 - lr_initial: 0.0002 - max_epochs: 20 + lr_initial: 0.0001 + max_epochs: 15 warmup_steps: 20000 lr_milestones: - - 50000 - - 70000 - - 90000 + - 55000 + - 75000 + - 10000 200k: {} - 2M: {} + # 2 gpus + 2M: + model: + num_interactions: 5 + hidden_channels: 1024 + num_gaussians: 200 + num_filters: 256 + optim: + batch_size: 96 + eval_batch_size: 96 20M: {} diff --git a/configs/models/sfarinet.yaml b/configs/models/sfarinet.yaml index 0a1b1ed922..57bc1afdec 100644 --- a/configs/models/sfarinet.yaml +++ b/configs/models/sfarinet.yaml @@ -74,7 +74,8 @@ is2re: model: hidden_channels: 384 num_interactions: 4 - + num_filters: 128 + num_gaussians: 100 optim: lr_initial: 0.001 lr_milestones: @@ -88,6 +89,8 @@ is2re: # ----- S2EF ----- # ------------------ +# For 4 GPUs + s2ef: default: model: diff --git a/ocpmodels/models/sfarinet.py b/ocpmodels/models/sfarinet.py index bdbd89cfd3..f55fc414aa 100644 --- a/ocpmodels/models/sfarinet.py +++ b/ocpmodels/models/sfarinet.py @@ -273,6 +273,7 @@ class SfariNet(BaseModel): def __init__(self, **kwargs): super().__init__() + torch.autograd.set_detect_anomaly(True) self.cutoff = kwargs["cutoff"] self.use_pbc = kwargs["use_pbc"] self.max_num_neighbors = kwargs["max_num_neighbors"] diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 3e8529f67e..3e1983d951 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -227,7 +227,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.epoch = epoch_int + (i + 1) / n_train self.step = epoch_int * n_train + i + 1 - # Get a batch. 
+ # Get a batch with loader_times.next("get_batch"): batch = next(train_loader_iter) @@ -357,7 +357,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): log_epoch_times = True self.model_forward(batch) self.logger.log({"Batch time": time.time() - start_time}) - self.logger.log({"Model run time": model_run_time / len(self.train_loader)}) + self.logger.log({"Model run time": model_run_time / n_train}) if log_epoch_times: self.logger.log({"Epoch time": sum(epoch_times) / len(epoch_times)}) diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index b92d21578e..d2e114c655 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -21,7 +21,7 @@ config["frame_averaging"] = "2D" config["fa_frames"] = "random" # "random" config["test_ri"] = True - config["optim"] = {"max_epochs": 1} + config["optim"] = {"max_epochs": 0} config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "rij" config["model"]["mp_type"] = "base" @@ -32,8 +32,8 @@ str_args = sys.argv[1:] if all("config" not in arg for arg in str_args): str_args.append("--is_debug") - str_args.append("--config=fanet-is2re-10k") - # str_args.append("--config=sfarinet-s2ef-2M") + # str_args.append("--config=fanet-is2re-10k") + str_args.append("--config=sfarinet-s2ef-2M") warnings.warn( "No model / mode is given; chosen as default" + f"Using: {str_args[-1]}" ) From 49fb8dc6ca37544776fc8dab9190638c08693440 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 23:51:52 -0500 Subject: [PATCH 027/273] early stopping & spooky params --- configs/exps/qm7x/schnet-from-spooky.yaml | 51 ++++++++++++++++ ocpmodels/modules/scheduler.py | 71 ++++++++++++++++++++++- ocpmodels/trainers/base_trainer.py | 3 +- ocpmodels/trainers/single_trainer.py | 29 +++++---- 4 files changed, 142 insertions(+), 12 deletions(-) create mode 100644 configs/exps/qm7x/schnet-from-spooky.yaml diff --git a/configs/exps/qm7x/schnet-from-spooky.yaml b/configs/exps/qm7x/schnet-from-spooky.yaml new file mode 100644 index 0000000000..6f2164c4d6 --- /dev/null +++ b/configs/exps/qm7x/schnet-from-spooky.yaml @@ -0,0 +1,51 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 8 + gres: gpu:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: schnet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions + optim: batch_size, lr_initial + optim: + batch_size: 10 + warmup_steps: 1000 + lr_initial: 0.0001 + # parameters EMA + # ema_decay: 0.999 + decay_steps: 750000 + scheduler: + decay_rate: 0.01 + max_steps: 1000000 + model: + hidden_channels: 128 + num_filters: 128 + num_gaussians: 20 + num_interactions: 6 + cutoff: 5.0 + +runs: + - optim: + ema_decay: 0.999 + - optim: + scheduler: LinearWarmupCosineAnnealingLR + - optim: + ema_decay: 0.999 + scheduler: LinearWarmupCosineAnnealingLR diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index dbd4106142..0d993b3925 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -1,7 +1,6 @@ """scheduler.py """ import inspect - import torch.optim.lr_scheduler as lr_scheduler from ocpmodels.common.utils import warmup_lr_lambda @@ -83,3 +82,73 @@ def filter_kwargs(self, optim_config): def get_lr(self): for 
group in self.optimizer.param_groups: return group["lr"] + + +class EarlyStopper: + """ + Class that stores the current best metric score and monitors whether + it's improving or not. If it does not decrease for a certain number + of validation calls (with some minimal improvement) then it tells the trainer + to stop. + """ + + def __init__( + self, patience=7, mode="min", min_abs_change=1e-5, store_all_steps=True + ): + self.patience = patience + self.mode = mode + self.counter = 0 + self.min_abs_change = min_abs_change + self.store_all_steps = store_all_steps + self.metrics = [] + + if self.mode == "min": + self.best_score = float("inf") + elif self.mode == "max": + self.best_score = float("-inf") + else: + raise ValueError("mode must be either min or max") + + self.early_stop = False + + def should_stop(self, metric): + """ + Returns True if the metric has not improved for a certain number of + steps. False otherwise. Stores the metric in `self.metrics`: all the steps if + `self.store_all_steps` is `True`, otherwise only the last `n=self.patience`. + + Args: + metric (Number): Metric to track. + + Returns: + bool: Wether to stop training or not + """ + metric = float(metric) + self.metrics.append(metric) + if not self.store_all_steps: + self.metrics = self.metrics[-self.patience :] + + if self.mode == "min": + if metric < self.best_score - self.min_abs_change: + self.best_score = metric + self.counter = 0 + else: + self.counter += 1 + elif self.mode == "max": + if metric > self.best_score + self.min_abs_change: + self.best_score = metric + self.counter = 0 + else: + self.counter += 1 + + if self.counter >= self.patience: + self.early_stop = True + + return self.early_stop + + @property + def reason(self): + return ( + f"Early stopping after {self.counter} steps with no improvement:\n" + + " -> ".join([f"{m:.6f}" for m in self.metrics[-self.patience :]]) + ) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index b56c910eb2..4804526f11 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -44,7 +44,7 @@ ) from ocpmodels.modules.loss import DDPLoss, L2MAELoss from ocpmodels.modules.normalizer import Normalizer -from ocpmodels.modules.scheduler import LRScheduler +from ocpmodels.modules.scheduler import LRScheduler, EarlyStopper @registry.register_trainer("base") @@ -79,6 +79,7 @@ def __init__(self, **kwargs): self.datasets = {} self.samplers = {} self.loaders = {} + self.early_stopper = EarlyStopper(patience=10, min_abs_change=1e-5) if torch.cuda.is_available() and not self.cpu: self.device = torch.device(f"cuda:{self.config['local_rank']}") diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 3e1983d951..7dd1b0aec4 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -297,7 +297,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # Evaluate on val set after every `eval_every` iterations. 
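Usage-wise, the stopper above is fed the tracked validation metric once per evaluation, and training halts after `patience` evaluations without at least `min_abs_change` of improvement. A minimal sketch with synthetic numbers, assuming the class ships in `ocpmodels.modules.scheduler` as in this patch:

from ocpmodels.modules.scheduler import EarlyStopper

stopper = EarlyStopper(patience=3, mode="min", min_abs_change=1e-4)
val_maes = [0.60, 0.55, 0.551, 0.5505, 0.552]  # made-up validation energy MAEs
for i, mae in enumerate(val_maes):
    if stopper.should_stop(mae):
        print(f"stopping at eval {i}:", stopper.reason)
        break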
if should_validate: self.save( - checkpoint_file=f"checkpoint-{str(self.step).zfill(6)}.pt", + checkpoint_file=f"checkpoint-{str(self.step).zfill(7)}.pt", training_state=True, ) @@ -307,10 +307,13 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): debug_batches=debug_batches, is_first=first_eval, ) + first_eval = False if val_metrics == "SIGTERM": return "SIGTERM" + current_val_metric = val_metrics[primary_metric]["metric"] + if current_val_metric < self.best_val_metric: self.best_val_metric = current_val_metric self.save( @@ -318,6 +321,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): checkpoint_file="best_checkpoint.pt", training_state=False, ) + if self.early_stopper.should_stop(current_val_metric): + print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") + if self.logger: + self.logger.add_tags(["E-S"]) + return self.end_of_training() + self.model.train() self.scheduler_step(eval_every, current_val_metric) @@ -334,9 +343,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): torch.cuda.empty_cache() # End of training. + if not is_test_env: + return self.end_of_training() - if is_test_env: - return + def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times): eas = self.eval_all_splits(True, epoch=epoch_int, debug_batches=debug_batches) if eas == "SIGTERM": @@ -349,17 +359,16 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # Time model if self.logger is not None: - log_epoch_times = False + log_epoch_times = self.config["optim"]["max_epochs"] > 0 start_time = time.time() - if self.config["optim"]["max_epochs"] == 0: - batch = next(iter(self.loaders["train"])) - else: - log_epoch_times = True + + # deterministic batch because shuffle=False for validation + batch = next(iter(self.loaders[self.config["dataset"]["default_val"]])) self.model_forward(batch) self.logger.log({"Batch time": time.time() - start_time}) self.logger.log({"Model run time": model_run_time / n_train}) if log_epoch_times: - self.logger.log({"Epoch time": sum(epoch_times) / len(epoch_times)}) + self.logger.log({"Epoch time": np.mean(epoch_times)}) # Check respect of symmetries if self.test_ri and not is_test_env: @@ -674,7 +683,7 @@ def test_model_symmetries(self, debug_batches=-1): reflected = self.reflect_graph(batch) preds3 = self.model_forward(reflected["batch_list"]) energy_diff_refl += torch.abs(preds1["energy"] - preds3["energy"]).sum() - if self.task_name == "s2ef": + if self.task_name == "s2ef": forces_diff_refl += torch.abs( preds1["forces"] @ reflected["rot"].to(preds1["forces"].device) - preds3["forces"] From 7798d2a561aeffbe8949c74c307290ca08433398 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 19:06:40 -0500 Subject: [PATCH 028/273] fix line length --- sbatch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sbatch.py b/sbatch.py index 6edb23ca03..bb0ff8ab36 100644 --- a/sbatch.py +++ b/sbatch.py @@ -232,7 +232,9 @@ def add_jobid_to_log(j, command_line, exp_name=None): sbatch_py_vars["num-nodes"] = args.nodes sbatch_py_vars["num-gpus"] = args.ntasks_per_node else: - args.py_args += f" --distributed --num-nodes {args.nodes} --num-gpus {args.ntasks_per_node}" + args.py_args += " --distributed --num-nodes {} --num-gpus {}".format( + args.nodes, args.ntasks_per_node + ) # add logdir to main.py's command-line arguments if "--logdir" not in args.py_args and args.logdir: From 2ea23b82b61f51c4b08bc6123918b9639146f4b6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 19:06:49 -0500 
Subject: [PATCH 029/273] add `broadcast_object_list` --- ocpmodels/common/distutils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ocpmodels/common/distutils.py b/ocpmodels/common/distutils.py index 32d30ebea3..7057d9695b 100644 --- a/ocpmodels/common/distutils.py +++ b/ocpmodels/common/distutils.py @@ -95,6 +95,12 @@ def broadcast(tensor, src, group=dist.group.WORLD, async_op=False): dist.broadcast(tensor, src, group, async_op) +def broadcast_object_list(obj_list, src=0): + if get_world_size() == 1: + return + dist.broadcast_object_list(obj_list, src=src) + + def all_reduce(data, group=dist.group.WORLD, average=False, device=None): if get_world_size() == 1: return data From 08bd4cc7cc7bb5034dd9a0109e945faacb96bf53 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 19:07:03 -0500 Subject: [PATCH 030/273] add `orion` flags --- ocpmodels/common/flags.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index f487115b31..354b52ee75 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -240,6 +240,20 @@ def add_core_args(self): default=100, help="Log training loss every n steps", ) + self.parser.add_argument( + "--orion_search", + "-o", + type=str, + help="Path to an orion search space yaml file", + ) + self.parser.add_argument( + "--unique_exp_name", + "-u", + type=str, + help="Name for this experiment. If the experiment name already exists," + + " the search space MUST be the same. If it is not, the job will crash." + + " If you change the search space, you must change the experiment name.", + ) flags = Flags() From 6a388ea6eb671c0d5aa3bcb7467e3fd34d193801 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 19:07:11 -0500 Subject: [PATCH 031/273] declare `objective` --- ocpmodels/trainers/base_trainer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 4804526f11..8b5fa7e077 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -783,14 +783,10 @@ def eval_all_splits( overall_energy_mae = cumulated_energy_mae / len(all_splits) self.logger.log({"Eval time": cumulated_time}) self.logger.log({"Overall MAE": overall_energy_mae}) + self.objective = overall_energy_mae if self.config["model"].get("regress_forces", False): overall_forces_mae = cumulated_forces_mae / len(all_splits) self.logger.log({"Overall Forces MAE": overall_forces_mae}) - if self.logger.ntfy: - self.logger.ntfy( - message=f"{JOB_ID} - Overall MAE: {overall_energy_mae}", - click=self.logger.url, - ) # Run on test split if final and "test" in self.config["dataset"] and self.eval_on_test: @@ -935,3 +931,11 @@ def handle_sigterm(self, signum, _): if signum == 15 and not self.sigterm: print("\nHandling SIGTERM signal received.\n") self.sigterm = True + + def close_datasets(self): + try: + for ds in self.datasets.values(): + if hasattr(ds, "close_db") and callable(ds.close_db): + ds.close_db() + except Exception as e: + print("Error closing datasets: ", str(e)) From 527a39e72f05b1bd1ad6f98b916307071288250f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 8 Jan 2023 01:14:01 -0500 Subject: [PATCH 032/273] fix `broadcast_object_list`device --- ocpmodels/common/distutils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/distutils.py b/ocpmodels/common/distutils.py index 7057d9695b..d4f4c13894 100644 --- 
a/ocpmodels/common/distutils.py +++ b/ocpmodels/common/distutils.py @@ -98,7 +98,12 @@ def broadcast(tensor, src, group=dist.group.WORLD, async_op=False): def broadcast_object_list(obj_list, src=0): if get_world_size() == 1: return - dist.broadcast_object_list(obj_list, src=src) + dist.broadcast_object_list( + obj_list, + src=src, + group=dist.group.WORLD, + device=torch.device(f"cuda:{get_rank()}"), + ) def all_reduce(data, group=dist.group.WORLD, average=False, device=None): From 6303248583ee9725f35ce2aa0898a3b319789476 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 8 Jan 2023 01:14:38 -0500 Subject: [PATCH 033/273] refactor to `Runner` and v0 for Orion --- main.py | 111 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 41 deletions(-) diff --git a/main.py b/main.py index 1d6a2b842d..29e0e1221e 100644 --- a/main.py +++ b/main.py @@ -16,6 +16,8 @@ from pathlib import Path import torch +from orion.client import build_experiment +from yaml import safe_load from ocpmodels.common import distutils from ocpmodels.common.flags import flags @@ -23,6 +25,7 @@ from ocpmodels.common.utils import ( JOB_ID, build_config, + merge_dicts, resolve, setup_imports, setup_logging, @@ -133,8 +136,53 @@ def print_warnings(): print("-" * 80 + "\n") +class Runner: + def __init__(self, trainer_config): + self.trainer_config = trainer_config + self.trainer = None + + def run(self, **hparams): + self.original_config = copy.deepcopy(self.trainer_config) + self.hparams = hparams + + should_be_0 = distutils.get_rank() + hp_list = [hparams, should_be_0] + distutils.broadcast_object_list(hp_list) + hparams, should_be_0 = hp_list + print("hparams: ", hparams) + print("should_be_0: ", should_be_0) + assert should_be_0 == 0 + if hparams: + print("Received hyper-parameters from Orion:") + print(hparams) + + self.trainer_config = merge_dicts(self.trainer_config, hparams) + cls = registry.get_trainer_class(self.trainer_config["trainer"]) + self.trainer: BaseTrainer = cls(**self.trainer_config) + task = registry.get_task_class(self.trainer_config["mode"])(self.trainer_config) + task.setup(self.trainer) + start_time = time.time() + print_warnings() + + signal = task.run() + + # handle job preemption / time limit + if signal == "SIGTERM": + print("\nJob was preempted. 
Wrapping up...\n") + self.trainer.close_datasets() + + distutils.synchronize() + logging.info(f"Total time taken: {time.time() - start_time}") + if self.trainer.logger is not None: + self.trainer.logger.log({"Total time": time.time() - start_time}) + + return [ + {"name": "energy_mae", "type": "objective", "value": self.trainer.objective} + ] + + if __name__ == "__main__": - ntfy = trainer = error = signal = None + runner = error = signal = None setup_logging() @@ -166,48 +214,29 @@ def print_warnings(): setup_imports() trainer_config = should_continue(trainer_config) trainer_config = read_slurm_env(trainer_config) + runner = Runner(trainer_config) # ------------------- # ----- Train ----- # ------------------- - trainer: BaseTrainer = registry.get_trainer_class(trainer_config["trainer"])( - **trainer_config - ) - task = registry.get_task_class(trainer_config["mode"])(trainer_config) - task.setup(trainer) - start_time = time.time() - if trainer.logger is not None: - message = f"{JOB_ID} - Training started 🚀" - if trainer_config.get("note"): - message += f" - {trainer_config.get('note')}" - if trainer_config.get("wandb_tags"): - message += f" - {trainer_config.get('wandb_tags')}" - trainer.logger.ntfy(message, click=trainer.logger.url) - print_warnings() - - signal = task.run() - - # handle job preemption / time limit - if signal == "SIGTERM": - print("\nJob was preempted. Wrapping up...\n") - for ds in trainer.datasets.values(): - if hasattr(ds, "close_db") and callable(ds.close_db): - ds.close_db() - - # ----------------- - # ----- End ----- - # ----------------- - distutils.synchronize() - logging.info(f"Total time taken: {time.time() - start_time}") - if trainer.logger is not None: - trainer.logger.log({"Total time": time.time() - start_time}) - - except Exception as e: - if trainer and trainer.logger: - e_name = e.__class__.__name__ - trainer.logger.ntfy( - f"{JOB_ID} - Training failed 😭" + f"{e_name} - {str(e)}", - click=trainer.logger.url or None, + if args.orion_search and distutils.is_master(): + assert args.unique_exp_name + space = safe_load(Path(args.orion_search).read_text()) + print("Search Space: ", space) + experiment = build_experiment( + name=args.unique_exp_name, + space=space, + algorithms={"mofa": {"seed": 123}}, + ) + experiment.workon( + runner.run, + max_trials_per_worker=1, + n_workers=1, + idle_timeout=3600 * 24 * 4, ) + else: + runner.run() + + except Exception: error = True print(traceback.format_exc()) @@ -220,8 +249,8 @@ def print_warnings(): distutils.cleanup() print("Done!") - if trainer and trainer.logger: - trainer.logger.finish(error or signal) + if runner and runner.trainer and runner.trainer.logger: + runner.trainer.logger.finish(error or signal) if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): print("Self-canceling SLURM job", JOB_ID) From a249efc119ba462b579861b01499595223ac44db Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:14:29 -0500 Subject: [PATCH 034/273] debug print `i_for_epoch` --- ocpmodels/trainers/single_trainer.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 7dd1b0aec4..bb14e1636d 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -5,6 +5,7 @@ LICENSE file in the root directory of this source tree. 
""" +import datetime import logging import os import time @@ -41,6 +42,10 @@ class SingleTrainer(BaseTrainer): can be found in `configs/ocp_is2re `_. # noqa: E501 """ + @property + def now(self): + return str(datetime.datetime.now()).split(".")[0] + def load_task(self): if not self.silent: logging.info(f"Loading dataset: {self.config['task']['dataset']}") @@ -200,7 +205,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. start_epoch = self.step // n_train - loader_times = Times() + timer = Times() epoch_times = [] model_run_time = 0 @@ -224,11 +229,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): if self.sigterm: return "SIGTERM" i_for_epoch += 1 + print(self.now, "i_for_epoch: ", i_for_epoch, flush=True) self.epoch = epoch_int + (i + 1) / n_train self.step = epoch_int * n_train + i + 1 - # Get a batch - with loader_times.next("get_batch"): + # Get a batch. + with timer.next("get_batch"): batch = next(train_loader_iter) # Forward, loss, backward. @@ -272,10 +278,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): ) # Log metrics. - gbm, gbs = loader_times.prepare_for_logging() + gbm, gbs = timer.prepare_for_logging() self.metrics["get_batch_time_mean"] = {"metric": gbm["get_batch"]} self.metrics["get_batch_time_std"] = {"metric": gbs["get_batch"]} - loader_times.reset() + timer.reset() # logging.info(f"Step: {self.step}") self.log_train_metrics() From cde8e1ae18a0cde999f23d0866fb5372702639ff Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:15:09 -0500 Subject: [PATCH 035/273] add debug prints --- main.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 29e0e1221e..90282124ac 100644 --- a/main.py +++ b/main.py @@ -30,6 +30,7 @@ setup_imports, setup_logging, update_from_sbatch_py_vars, + move_lmdb_data_to_slurm_tmpdir, ) from ocpmodels.trainers import BaseTrainer @@ -147,10 +148,10 @@ def run(self, **hparams): should_be_0 = distutils.get_rank() hp_list = [hparams, should_be_0] + # print("hparams pre-broadcast: ", hparams) distutils.broadcast_object_list(hp_list) hparams, should_be_0 = hp_list - print("hparams: ", hparams) - print("should_be_0: ", should_be_0) + # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 if hparams: print("Received hyper-parameters from Orion:") @@ -176,9 +177,14 @@ def run(self, **hparams): if self.trainer.logger is not None: self.trainer.logger.log({"Total time": time.time() - start_time}) - return [ - {"name": "energy_mae", "type": "objective", "value": self.trainer.objective} - ] + objective = self.trainer.objective + # print("objective pre-broadcast: ", objective) + o_list = [objective] + distutils.broadcast_object_list(o_list) + objective = o_list[0] + # print("objective post-broadcast: ", objective) + + return [{"name": "energy_mae", "type": "objective", "value": objective}] if __name__ == "__main__": @@ -206,15 +212,22 @@ def run(self, **hparams): if args.distributed: distutils.setup(trainer_config) + print("Distributed backend setup.") + + if distutils.is_master(): + trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) + # distutils.synchronize() try: # ------------------- # ----- Setup ----- # ------------------- setup_imports() + print("All things imported.") trainer_config = should_continue(trainer_config) trainer_config = read_slurm_env(trainer_config) 
runner = Runner(trainer_config) + print("Runner ready.") # ------------------- # ----- Train ----- # ------------------- @@ -234,6 +247,7 @@ def run(self, **hparams): idle_timeout=3600 * 24 * 4, ) else: + print("Starting runner.") runner.run() except Exception: @@ -243,7 +257,7 @@ def run(self, **hparams): finally: if args.distributed: print( - "Waiting for all processes to finish with distutils.cleanup()...", + "\nWaiting for all processes to finish with distutils.cleanup()...", end="", ) distutils.cleanup() @@ -253,5 +267,5 @@ def run(self, **hparams): runner.trainer.logger.finish(error or signal) if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): - print("Self-canceling SLURM job", JOB_ID) + print("\nSelf-canceling SLURM job", JOB_ID) os.system(f"scancel {JOB_ID}") From 48b83796313f8c4b29bac795bc398c934813776f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:16:23 -0500 Subject: [PATCH 036/273] read from scratch --- configs/models/tasks/is2re.yaml | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/configs/models/tasks/is2re.yaml b/configs/models/tasks/is2re.yaml index fe7ed92187..059ef62c53 100644 --- a/configs/models/tasks/is2re.yaml +++ b/configs/models/tasks/is2re.yaml @@ -18,30 +18,27 @@ default: dataset: default_val: val_id train: - src: /network/projects/_groups/ocp/oc20/is2re/all/train/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/train/ normalize_labels: True target_mean: -1.525913953781128 target_std: 2.279365062713623 val_id: - src: /network/projects/_groups/ocp/oc20/is2re/all/val_id/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_id/ val_ood_cat: - src: /network/projects/_groups/ocp/oc20/is2re/all/val_ood_cat/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_cat/ val_ood_ads: - src: /network/projects/_groups/ocp/oc20/is2re/all/val_ood_ads/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_ads/ val_ood_both: - src: /network/projects/_groups/ocp/oc20/is2re/all/val_ood_both/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_both/ 10k: dataset: train: - src: /network/projects/_groups/ocp/oc20/is2re/10k/train/data.lmdb # data/is2re/10k/train/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/10k/train 100k: dataset: train: - src: /network/projects/_groups/ocp/oc20/is2re/100k/train/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/100k/train -all: - dataset: - train: - src: /network/projects/_groups/ocp/oc20/is2re/all/train/data.lmdb +all: {} \ No newline at end of file From 253a067cace42bf1d0bf3c81d41500d71a6758dd Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:16:59 -0500 Subject: [PATCH 037/273] initial None objective --- ocpmodels/trainers/base_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 8b5fa7e077..4a0216c074 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -66,6 +66,7 @@ def __init__(self, **kwargs): } self.sigterm = False + self.objective = None self.epoch = 0 self.step = 0 self.cpu = self.config["cpu"] From f8d70d1ba0605633be8e16a2dd4d0419a47a1938 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 9 Jan 2023 09:32:34 -0500 Subject: [PATCH 038/273] Force MAE + many config files --- configs/exps/gnn/test-gnn-all-1.yaml | 6 +++--- configs/exps/icml/baseline_s2ef.yaml | 
20 ++++++++---------- .../exps/prop-check/symmetries_s2ef_2.yaml | 21 +++++-------------- configs/models/dpp.yaml | 2 +- ocpmodels/trainers/base_trainer.py | 3 ++- ocpmodels/trainers/single_trainer.py | 6 ++++-- 6 files changed, 24 insertions(+), 34 deletions(-) diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml index 00a54ada07..ea5dd8ec56 100644 --- a/configs/exps/gnn/test-gnn-all-1.yaml +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -17,7 +17,7 @@ default: energy_head: 'weighted-av-initial-embeds' # False ? wandb_tags: 'test-fanet' optim: - lr_initial: 0.0005 + lr_initial: 0.0008 runs: - config: fanet-is2re-all @@ -61,7 +61,7 @@ runs: mp_type: updownscale edge_embed_type: all optim: - lr_initial: 0.0004 + lr_initial: 0.0007 max_epochs: 25 - config: fanet-is2re-all @@ -75,7 +75,7 @@ runs: frame_averaging: 2D fa_fames: random optim: - lr_initial: 0.0004 + lr_initial: 0.0005 max_epochs: 25 - config: fanet-is2re-all diff --git a/configs/exps/icml/baseline_s2ef.yaml b/configs/exps/icml/baseline_s2ef.yaml index cefff0fb5e..154a4c07a5 100644 --- a/configs/exps/icml/baseline_s2ef.yaml +++ b/configs/exps/icml/baseline_s2ef.yaml @@ -1,27 +1,25 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:2 + gres: gpu:rtx8000:1 partition: long time: 42:00:00 default: test_ri: True mode: train - wandb_tags: 'baseline-schnet' + wandb_tags: 'Baseline dpp 1 Gpus' runs: + - config: dpp-s2ef-2M + note: 'Baseline Schnet S2EF' + optim: + batch_size: 368 + eval_batch_size: 368 - config: schnet-s2ef-2M note: 'Baseline Schnet S2EF 2 GPU' optim: max_epochs: 15 force_coefficient: 50 - batch_size: 96 - eval_batch_size: 96 - - config: schnet-is2re-2M - note: 'Baseline Schnet IS2RE 2 GPU' - optim: - max_epochs: 15 - force_coefficient: 50 - batch_size: 128 - eval_batch_size: 128 + batch_size: 192 + eval_batch_size: 192 diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml index aebe1b7934..9abfc02b40 100644 --- a/configs/exps/prop-check/symmetries_s2ef_2.yaml +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -1,9 +1,9 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:1 + gres: gpu:rtx8000:4 partition: long - time: 20:00:00 + time: 40:00:00 default: test_ri: True @@ -16,19 +16,17 @@ default: energy_head: False # False ? 
optim: max_epochs: 5 - batch_size: 196 - eval_batch_size: 196 - wandb_tags: 's2ef-sym-prop' + wandb_tags: 'prop-check-ICLM' runs: - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs 1 Gpu' + note: 'Baseline 5 epochs' frame_averaging: 2D fa_frames: all model: regress_forces: from_energy - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs 1 Gpu' + note: 'Baseline 5 epochs' frame_averaging: 3D fa_frames: all model: @@ -77,12 +75,3 @@ runs: energy_grad_coefficient: 100 force_coefficient: 30 energy_coefficient: 0 - - config: sfarinet-s2ef-2M - note: 'Large force coef' - frame_averaging: 2D - fa_frames: random - model: - regress_forces: direct_with_gradient_target - optim: - force_coefficient: 75 - energy_coefficient: 1 \ No newline at end of file diff --git a/configs/models/dpp.yaml b/configs/models/dpp.yaml index 6a289bbc22..4b973595f4 100644 --- a/configs/models/dpp.yaml +++ b/configs/models/dpp.yaml @@ -119,7 +119,7 @@ s2ef: - 41666 warmup_steps: 10416 warmup_factor: 0.2 - max_epochs: 15 + max_epochs: 5 force_coefficient: 50 model: hidden_channels: 192 diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 4a0216c074..72bfea076d 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -783,8 +783,9 @@ def eval_all_splits( if final and self.config["logger"] == "wandb" and distutils.is_master(): overall_energy_mae = cumulated_energy_mae / len(all_splits) self.logger.log({"Eval time": cumulated_time}) - self.logger.log({"Overall MAE": overall_energy_mae}) self.objective = overall_energy_mae + self.logger.log({"Eval time": cumulated_time}) + self.logger.log({"Overall MAE": overall_energy_mae}) if self.config["model"].get("regress_forces", False): overall_forces_mae = cumulated_forces_mae / len(all_splits) self.logger.log({"Overall Forces MAE": overall_forces_mae}) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index bb14e1636d..e7e062ebb3 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -372,7 +372,9 @@ def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times) batch = next(iter(self.loaders[self.config["dataset"]["default_val"]])) self.model_forward(batch) self.logger.log({"Batch time": time.time() - start_time}) - self.logger.log({"Model run time": model_run_time / n_train}) + self.logger.log( + {"Model run time": model_run_time / len(self.loaders["train"])} + ) if log_epoch_times: self.logger.log({"Epoch time": np.mean(epoch_times)}) @@ -697,7 +699,7 @@ def test_model_symmetries(self, debug_batches=-1): # assert torch.allclose( # torch.abs( # batch[0].force @ reflected["rot"].to(batch[0].force.device) - # - reflected["batch_list"][0].force # .to(batch[0].force.device) + # - reflected["batch_list"][0].force #.to(batch[0].force.device) # ).sum(), # torch.tensor([0.0]), # .to(batch[0].force.device) # atol=1e-05, From a8659db81ce7d4221cef8aa870e447aab0cc05a3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 18:15:09 -0500 Subject: [PATCH 039/273] resume from orion --- main.py | 102 +++++------------------------ ocpmodels/common/utils.py | 133 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+), 87 deletions(-) diff --git a/main.py b/main.py index 90282124ac..da04d5cd93 100644 --- a/main.py +++ b/main.py @@ -25,12 +25,15 @@ from ocpmodels.common.utils import ( JOB_ID, build_config, + continue_from_slurm_job_id, + continue_orion_exp, merge_dicts, + 
move_lmdb_data_to_slurm_tmpdir, + read_slurm_env, resolve, setup_imports, setup_logging, update_from_sbatch_py_vars, - move_lmdb_data_to_slurm_tmpdir, ) from ocpmodels.trainers import BaseTrainer @@ -48,81 +51,6 @@ ) -def read_slurm_env(config): - """ - Parses the output of `scontrol show` in order to store the slurm - config (mem, cpu, node, gres) as a `"slurm"` key in the `config` object. - - Args: - config (dict): Run configuration - - Returns: - dict: Updated run config if no "slurm" key exists or it's empty - """ - if not config.get("slurm"): - return config - - command = f"scontrol show job {JOB_ID}" - scontrol = subprocess.check_output(command.split(" ")).decode("utf-8").strip() - params = re.findall(r"TRES=(.+)\n", scontrol) - try: - if params: - params = params[0] - for kv in params.split(","): - k, v = kv.split("=") - config["slurm"][k] = v - except Exception as e: - print("Slurm config creation exception", e) - finally: - return config - - -def should_continue(config): - """ - Assuming runs are consistently executed in a `run_dir` with the - `run_dir/$SLURM_JOBID` pattern, this functions looks for an existing - directory with the same $SLURM_JOBID as the current job that contains - a checkpoint. - - If there is one, it tries to find `best_checkpoint.ckpt`. - If the latter does not exist, it looks for the latest checkpoint, - assuming a naming convention like `checkpoint-{step}.pt`. - - If a checkpoint is found, its path is set in `config["checkpoint"]`. - Otherwise, returns the original config. - - Args: - config (dict): The original config to overwrite - - Returns: - dict: The updated config if a checkpoint has been found - """ - if config.get("checkpoint"): - return config - - job_id = os.environ.get("SLURM_JOBID") - if job_id is None: - return config - - base_dir = Path(config["run_dir"]).resolve().parent - ckpt_dir = base_dir / job_id / "checkpoints" - if not ckpt_dir.exists() or not ckpt_dir.is_dir(): - return config - - best_ckp = ckpt_dir / "best_checkpoint.pt" - if best_ckp.exists(): - config["checkpoint"] = str(best_ckp) - else: - ckpts = list(ckpt_dir.glob("checkpoint-*.pt")) - if not ckpts: - return config - latest_ckpt = sorted(ckpts, key=lambda f: f.stem)[-1] - if latest_ckpt.exists() and latest_ckpt.is_file(): - config["checkpoint"] = str(latest_ckpt) - - return config - - def print_warnings(): warnings = [ "`max_num_neighbors` is set to 40. 
This should be tuned per model.", @@ -144,6 +72,10 @@ def __init__(self, trainer_config): def run(self, **hparams): self.original_config = copy.deepcopy(self.trainer_config) + if distutils.is_master(): + orion_trial = hparams.pop("orion_trial", None) + if orion_trial: + hparams["orion_hash_params"] = orion_trial.hash_params self.hparams = hparams should_be_0 = distutils.get_rank() @@ -158,6 +90,7 @@ def run(self, **hparams): print(hparams) self.trainer_config = merge_dicts(self.trainer_config, hparams) + self.trainer_config = continue_orion_exp(self.trainer_config) cls = registry.get_trainer_class(self.trainer_config["trainer"]) self.trainer: BaseTrainer = cls(**self.trainer_config) task = registry.get_task_class(self.trainer_config["mode"])(self.trainer_config) @@ -224,28 +157,23 @@ def run(self, **hparams): # ------------------- setup_imports() print("All things imported.") - trainer_config = should_continue(trainer_config) + trainer_config = continue_from_slurm_job_id(trainer_config) trainer_config = read_slurm_env(trainer_config) runner = Runner(trainer_config) print("Runner ready.") # ------------------- # ----- Train ----- # ------------------- - if args.orion_search and distutils.is_master(): - assert args.unique_exp_name - space = safe_load(Path(args.orion_search).read_text()) + if args.orion_search_path and distutils.is_master(): + assert args.orion_unique_exp_name + space = safe_load(Path(args.orion_search_path).read_text()) print("Search Space: ", space) experiment = build_experiment( - name=args.unique_exp_name, + name=args.orion_unique_exp_name, space=space, algorithms={"mofa": {"seed": 123}}, ) - experiment.workon( - runner.run, - max_trials_per_worker=1, - n_workers=1, - idle_timeout=3600 * 24 * 4, - ) + experiment.workon(runner.run, max_trials_per_worker=1, n_workers=1) else: print("Starting runner.") runner.run() diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index fe2a524c46..cd4f426032 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -43,6 +43,139 @@ JOB_ID = os.environ.get("SLURM_JOB_ID") +def continue_orion_exp(trainer_config): + if not trainer_config.get("orion_search_path") or not trainer_config.get( + "orion_unique_exp_name" + ): + return trainer_config + + if "orion_hash_params" not in trainer_config: + faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" + print( + "\n\nWARNING: trainer_config has 'orion_search_path' and 'orion_unique_exp_name'", + "but no 'orion_trial'. 
This can lead to inconsistencies.", + f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", + ) + faulty_path.write_text(yaml.dump(trainer_config)) + return trainer_config + + hash_params = trainer_config["orion_hash_params"] + exp_name = trainer_config["orion_unique_exp_name"] + id_file = f"{exp_name}--{hash_params}.unique" + (Path(trainer_config["run_dir"]) / id_file).touch() + base_dir = Path(trainer_config["run_dir"]).parent + existing_id_files = list(base_dir.glob(f"*/{id_file}")) + + if not existing_id_files: + return trainer_config + + latest_dirs = sorted( + [ + f.parent + for f in existing_id_files + if float(f.parent.name) != float(trainer_config["job_id"]) + ], + key=lambda f: float(f.name), + ) + + if not latest_dirs: + return trainer_config + + latest_ckpts = sorted( + [f for f in (latest_dirs[-1] / "checkpoints").glob("checkpoint-*")], + key=lambda f: float(f.stem.split("-")[-1]), + ) + + if not latest_ckpts: + raise ValueError(f"No checkpoint found in {str(latest_dirs[-1])}") + trainer_config["checkpoint"] = str(latest_ckpts[-1]) + print( + f"\nFound {len(latest_ckpts)} existing Orion runs.", + "Resuming from latest:", + str(latest_dirs[-1]), + ) + print("Based on unique file id:", id_file) + print("Continuing from checkpoint:", trainer_config["checkpoint"], end="\n\n") + return trainer_config + + +def read_slurm_env(config): + """ + Parses the output of `scontrol show` in order to store the slurm + config (mem, cpu, node, gres) as a `"slurm"` key in the `config` object. + + Args: + config (dict): Run configuration + + Returns: + dict: Updated run config if no "slurm" key exists or it's empty + """ + if not config.get("slurm"): + return config + + command = f"scontrol show job {JOB_ID}" + scontrol = subprocess.check_output(command.split(" ")).decode("utf-8").strip() + params = re.findall(r"TRES=(.+)\n", scontrol) + try: + if params: + params = params[0] + for kv in params.split(","): + k, v = kv.split("=") + config["slurm"][k] = v + except Exception as e: + print("Slurm config creation exception", e) + finally: + return config + + +def continue_from_slurm_job_id(config): + """ + Assuming runs are consistently executed in a `run_dir` with the + `run_dir/$SLURM_JOBID` pattern, this functions looks for an existing + directory with the same $SLURM_JOBID as the current job that contains + a checkpoint. + + If there is one, it tries to find `best_checkpoint.ckpt`. + If the latter does not exist, it looks for the latest checkpoint, + assuming a naming convention like `checkpoint-{step}.pt`. + + If a checkpoint is found, its path is set in `config["checkpoint"]`. + Otherwise, returns the original config. 
+ + Args: + config (dict): The original config to overwrite + + Returns: + dict: The updated config if a checkpoint has been found + """ + if config.get("checkpoint"): + return config + + job_id = os.environ.get("SLURM_JOBID") + if job_id is None: + return config + + base_dir = Path(config["run_dir"]).resolve().parent + ckpt_dir = base_dir / job_id / "checkpoints" + if not ckpt_dir.exists() or not ckpt_dir.is_dir(): + return config + + best_ckp = ckpt_dir / "best_checkpoint.pt" + if best_ckp.exists(): + config["checkpoint"] = str(best_ckp) + else: + ckpts = list(ckpt_dir.glob("checkpoint-*.pt")) + if not ckpts: + return config + latest_ckpt = sorted( + ckpts, key=lambda f: float(f.stem.split("checkpoint-")[-1]) + )[-1] + if latest_ckpt.exists() and latest_ckpt.is_file(): + config["checkpoint"] = str(latest_ckpt) + + return config + + def move_lmdb_data_to_slurm_tmpdir(trainer_config): if ( not trainer_config.get("cp_data_to_tmpdir") From 99ecab351e54030e8d19c0237d5feb3c0ab5658c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 18:15:14 -0500 Subject: [PATCH 040/273] v0 orion exp launch --- launch_exp.py | 47 ++++++++++++++++++++++++++++++++++----- ocpmodels/common/flags.py | 2 +- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 9196bfb883..7c0e9e7303 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -5,11 +5,13 @@ from pathlib import Path from minydra import resolved_args -from yaml import safe_load +from yaml import safe_load, dump from sbatch import now import copy +ROOT = Path(__file__).resolve().parent + def util_strings(jobs, yaml_comments=False): s = "All jobs launched: " + ", ".join(jobs) @@ -103,7 +105,7 @@ def get_commit(): def find_exp(name): - exp_dir = Path(__file__).parent / "configs" / "exps" + exp_dir = ROOT / "configs" / "exps" exp_file = exp_dir / f"{name}.yaml" if exp_file.exists(): return exp_file @@ -139,16 +141,51 @@ def cli_arg(args, key=""): if __name__ == "__main__": + orion_conf = ROOT / "data" / "orion" / "orion_config.yaml" args = resolved_args() assert "exp" in args regex = args.get("match", ".*") + ts = now() exp_name = args.exp.replace(".yml", "").replace(".yaml", "") exp_file = find_exp(exp_name) exp = safe_load(exp_file.open("r")) - runs = exp["runs"] + if "orion" in exp: + assert "runs" not in exp, "Cannot use both Orion and runs" + assert ( + "orion_unique_exp_name" in exp + ), "Must specify 'orion_unique_exp_name' in exp file" + if not orion_conf.exists(): + orion_conf.write_text( + dump( + { + "storage": { + "database": { + "host": str(orion_conf.parent / "orion_db.pkl"), + "type": "pickleddb", + } + } + } + ) + ) + search_path = ( + orion_conf.parent + / "search-spaces" + / f"{ts}-{exp['orion_unique_exp_name']}.yaml" + ) + search_path.parent.mkdir(exist_ok=True, parents=True) + assert not search_path.exists() + search_path.write_text(dump(exp["orion"])) + runs = [ + { + "orion_search_path": str(search_path), + "orion_unique_exp_name": exp["orion_unique_exp_name"], + } + ] + else: + runs = exp["runs"] commands = [] @@ -191,8 +228,8 @@ def cli_arg(args, key=""): print(f"Launching job {c:3}", end="\r") or os.popen(command).read().strip() for c, command in enumerate(commands) ] - outdir = Path(__file__).resolve().parent / "data" / "exp_outputs" / exp_name - outfile = outdir / f"{exp_name.split('/')[-1]}_{now()}.txt" + outdir = ROOT / "data" / "exp_outputs" / exp_name + outfile = outdir / f"{exp_name.split('/')[-1]}_{ts}.txt" outfile.parent.mkdir(exist_ok=True, parents=True) text += 
separator.join(outputs) jobs = [ diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 354b52ee75..9828598ca4 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -241,7 +241,7 @@ def add_core_args(self): help="Log training loss every n steps", ) self.parser.add_argument( - "--orion_search", + "--orion_search_path", "-o", type=str, help="Path to an orion search space yaml file", From 0a0d8a911c3b2a61693ac6c7afbe2903b5936050 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 9 Jan 2023 18:22:11 -0500 Subject: [PATCH 041/273] update config files --- configs/exps/gnn/test-gnn-all-1.yaml | 6 ++--- configs/exps/icml/baseline_s2ef.yaml | 20 +++++++++------- configs/exps/prop-check/symmetries.yaml | 24 +++++++++---------- .../exps/prop-check/symmetries_s2ef_2.yaml | 21 ++++++++++++---- configs/models/dpp.yaml | 2 +- configs/models/fanet.yaml | 4 ++-- ocpmodels/trainers/single_trainer.py | 2 +- 7 files changed, 46 insertions(+), 33 deletions(-) diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml index ea5dd8ec56..00a54ada07 100644 --- a/configs/exps/gnn/test-gnn-all-1.yaml +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -17,7 +17,7 @@ default: energy_head: 'weighted-av-initial-embeds' # False ? wandb_tags: 'test-fanet' optim: - lr_initial: 0.0008 + lr_initial: 0.0005 runs: - config: fanet-is2re-all @@ -61,7 +61,7 @@ runs: mp_type: updownscale edge_embed_type: all optim: - lr_initial: 0.0007 + lr_initial: 0.0004 max_epochs: 25 - config: fanet-is2re-all @@ -75,7 +75,7 @@ runs: frame_averaging: 2D fa_fames: random optim: - lr_initial: 0.0005 + lr_initial: 0.0004 max_epochs: 25 - config: fanet-is2re-all diff --git a/configs/exps/icml/baseline_s2ef.yaml b/configs/exps/icml/baseline_s2ef.yaml index 154a4c07a5..cefff0fb5e 100644 --- a/configs/exps/icml/baseline_s2ef.yaml +++ b/configs/exps/icml/baseline_s2ef.yaml @@ -1,25 +1,27 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:1 + gres: gpu:rtx8000:2 partition: long time: 42:00:00 default: test_ri: True mode: train - wandb_tags: 'Baseline dpp 1 Gpus' + wandb_tags: 'baseline-schnet' runs: - - config: dpp-s2ef-2M - note: 'Baseline Schnet S2EF' - optim: - batch_size: 368 - eval_batch_size: 368 - config: schnet-s2ef-2M note: 'Baseline Schnet S2EF 2 GPU' optim: max_epochs: 15 force_coefficient: 50 - batch_size: 192 - eval_batch_size: 192 + batch_size: 96 + eval_batch_size: 96 + - config: schnet-is2re-2M + note: 'Baseline Schnet IS2RE 2 GPU' + optim: + max_epochs: 15 + force_coefficient: 50 + batch_size: 128 + eval_batch_size: 128 diff --git a/configs/exps/prop-check/symmetries.yaml b/configs/exps/prop-check/symmetries.yaml index 0b26ce2d81..1da0808595 100644 --- a/configs/exps/prop-check/symmetries.yaml +++ b/configs/exps/prop-check/symmetries.yaml @@ -21,69 +21,69 @@ default: runs: - config: sfarinet-s2ef-2M note: 'Baseline 5 epochs' - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: all - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 3D fa_frames: all - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: DA - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: det - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' 
frame_averaging: 2D fa_frames: se3-det - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: all - model: + model: regress_forces: direct - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: se3-random - model: + model: regress_forces: direct - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: all - model: + model: regress_forces: direct_with_gradient_target - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: DA - model: + model: regress_forces: direct_with_gradient_target - config: sfarinet-s2ef-2M note: 'No forces coefficient ! Only energy' - model: + model: regress_forces: direct force_coefficient: 0 energy_grad_coefficient: 10 - config: sfarinet-s2ef-2M note: 'Large energy grad coef' frame_averaging: DA - model: + model: regress_forces: direct_with_gradient_target energy_grad_coefficient: 50 diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml index 9abfc02b40..aebe1b7934 100644 --- a/configs/exps/prop-check/symmetries_s2ef_2.yaml +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -1,9 +1,9 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:4 + gres: gpu:rtx8000:1 partition: long - time: 40:00:00 + time: 20:00:00 default: test_ri: True @@ -16,17 +16,19 @@ default: energy_head: False # False ? optim: max_epochs: 5 - wandb_tags: 'prop-check-ICLM' + batch_size: 196 + eval_batch_size: 196 + wandb_tags: 's2ef-sym-prop' runs: - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs' + note: 'Baseline 5 epochs 1 Gpu' frame_averaging: 2D fa_frames: all model: regress_forces: from_energy - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs' + note: 'Baseline 5 epochs 1 Gpu' frame_averaging: 3D fa_frames: all model: @@ -75,3 +77,12 @@ runs: energy_grad_coefficient: 100 force_coefficient: 30 energy_coefficient: 0 + - config: sfarinet-s2ef-2M + note: 'Large force coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + force_coefficient: 75 + energy_coefficient: 1 \ No newline at end of file diff --git a/configs/models/dpp.yaml b/configs/models/dpp.yaml index 4b973595f4..6a289bbc22 100644 --- a/configs/models/dpp.yaml +++ b/configs/models/dpp.yaml @@ -119,7 +119,7 @@ s2ef: - 41666 warmup_steps: 10416 warmup_factor: 0.2 - max_epochs: 5 + max_epochs: 15 force_coefficient: 50 model: hidden_channels: 192 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 5bf8ad3546..b04d7dfba7 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -85,7 +85,7 @@ is2re: # ----- S2EF ----- # ------------------ -# For 4 GPUs +# For 4 GPUs s2ef: default: @@ -115,7 +115,7 @@ s2ef: # 2 gpus 2M: - model: + model: num_interactions: 5 hidden_channels: 1024 num_gaussians: 200 diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index e7e062ebb3..6f9234d45a 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -684,7 +684,7 @@ def test_model_symmetries(self, debug_batches=-1): # Compute total difference across frames for pos1, pos2 in zip(batch[0].fa_pos, rotated["batch_list"][0].fa_pos): pos_diff += pos1 - pos2 - # Manhanttan distance of pos matrix wrt 0 matrix. + # Manhattan distance of pos matrix wrt 0 matrix. 
pos_diff_total += torch.abs(pos_diff).sum() # Reflect graph and compute diff in prediction From 30eb6b7a17c49a10fb50dbecaf852f65e498c7f7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 01:05:01 -0500 Subject: [PATCH 042/273] improve Orion setup --- configs/exps/debug/orion.yaml | 55 +++++++++++++++++++++++++++++++++++ launch_exp.py | 43 +++++++++++---------------- main.py | 11 ++++++- 3 files changed, 82 insertions(+), 27 deletions(-) create mode 100644 configs/exps/debug/orion.yaml diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml new file mode 100644 index 0000000000..e913e42cef --- /dev/null +++ b/configs/exps/debug/orion.yaml @@ -0,0 +1,55 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 48GB + cpus: 4 + gres: gpu:16gb:1 + time: 1:00:00 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + wandb_project: ocp-qm + config: schnet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion-debug + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + optim: + batch_size: 64 + warmup_steps: 3000 + lr_initial: 0.0002 + # parameters EMA + ema_decay: 0.999 + # exp. decay to 0.01 * lr_initial in 1000000 steps + decay_steps: max_steps + decay_rate: 0.05 # at the end of training, lr is decay_rate*lr_initial + # max_epochs = ref_steps[3e6] / (n_train[110 000] / ref_batch_size[32]) + max_epochs: -1 + max_steps: 3000000 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + +orion: + # Remember to change the experiment name if you change anything in the search space + _meta_: + n_runs: 2 + unique_exp_name: ocp-qm9-orion-debug + optim: + batch_size: uniform(32, 1024, discrete=True) + lr_initial: loguniform(1e-5, 5e-3, precision=2) + max_steps: fidelity(1e4, 1e6, base=5e5) + model: + num_gaussians: uniform(16, 200, base=20, discrete=True) + hidden_channels: uniform(32, 512, discrete=True) + num_filters: uniform(32, 512, discrete=True) + num_interactions: uniform(1, 7, discrete=True) + phys_embeds: choices([True, False]) \ No newline at end of file diff --git a/launch_exp.py b/launch_exp.py index 7c0e9e7303..34999ce607 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -80,12 +80,14 @@ def write_exp_yaml_and_jobs(exp_file, outfile, jobs): jobs (list[str]): List of jobs, one per run line in the yaml exp_file """ lines = exp_file.read_text().splitlines() - run_line = lines.index("runs:") - j = 0 - for i, line in enumerate(lines[run_line:]): - if line.strip().startswith("- "): - lines[run_line + i] = f"{line} # {jobs[j]}" - j += 1 + if "runs:" in lines: + run_line = lines.index("runs:") + j = 0 + for i, line in enumerate(lines[run_line:]): + if line.strip().startswith("- "): + lines[run_line + i] = f"{line} # {jobs[j]}" + j += 1 + lines += [""] + util_strings(jobs, True).splitlines() yml_out = outfile.with_suffix(".yaml") yml_out.write_text("\n".join(lines)) @@ -141,7 +143,6 @@ def cli_arg(args, key=""): if __name__ == "__main__": - orion_conf = ROOT / "data" / "orion" / "orion_config.yaml" args = resolved_args() assert "exp" in args regex = args.get("match", ".*") @@ -153,27 +154,16 @@ def cli_arg(args, key=""): exp = safe_load(exp_file.open("r")) if "orion" in exp: + orion_base = ROOT / 
"data" / "orion" assert "runs" not in exp, "Cannot use both Orion and runs" + meta = exp["orion"].pop("_meta_", {}) assert ( - "orion_unique_exp_name" in exp - ), "Must specify 'orion_unique_exp_name' in exp file" - if not orion_conf.exists(): - orion_conf.write_text( - dump( - { - "storage": { - "database": { - "host": str(orion_conf.parent / "orion_db.pkl"), - "type": "pickleddb", - } - } - } - ) - ) + "unique_exp_name" in meta + ), "Must specify 'orion._meta_.unique_exp_name' in exp file" + assert "n_runs" in meta, "Must specify 'orion._meta_.n_runs' in exp file" + search_path = ( - orion_conf.parent - / "search-spaces" - / f"{ts}-{exp['orion_unique_exp_name']}.yaml" + orion_base / "search-spaces" / f"{ts}-{meta['unique_exp_name']}.yaml" ) search_path.parent.mkdir(exist_ok=True, parents=True) assert not search_path.exists() @@ -181,8 +171,9 @@ def cli_arg(args, key=""): runs = [ { "orion_search_path": str(search_path), - "orion_unique_exp_name": exp["orion_unique_exp_name"], + "orion_unique_exp_name": meta["unique_exp_name"], } + for _ in range(meta["n_runs"]) ] else: runs = exp["runs"] diff --git a/main.py b/main.py index da04d5cd93..2fc13171e3 100644 --- a/main.py +++ b/main.py @@ -24,6 +24,7 @@ from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, + ROOT, build_config, continue_from_slurm_job_id, continue_orion_exp, @@ -169,9 +170,17 @@ def run(self, **hparams): space = safe_load(Path(args.orion_search_path).read_text()) print("Search Space: ", space) experiment = build_experiment( + storage={ + "database": { + "host": str( + ROOT / "data" / "orion" / "storage" / "orion_db.pkl" + ), + "type": "pickleddb", + } + }, name=args.orion_unique_exp_name, space=space, - algorithms={"mofa": {"seed": 123}}, + algorithms={"asha": {"seed": 123}}, ) experiment.workon(runner.run, max_trials_per_worker=1, n_workers=1) else: From dc3ddca37a449e3e3842c7a32618b0bd05b43fd1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 01:14:26 -0500 Subject: [PATCH 043/273] update flags --- configs/exps/debug/orion.yaml | 8 ++++---- ocpmodels/common/flags.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml index e913e42cef..1e23cf9299 100644 --- a/configs/exps/debug/orion.yaml +++ b/configs/exps/debug/orion.yaml @@ -1,11 +1,11 @@ # more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij job: - mem: 48GB + mem: 32GB cpus: 4 - gres: gpu:16gb:1 + gres: gpu:1 time: 1:00:00 - partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + partition: main + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 env: ocp-a100 default: diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 9828598ca4..8142c80858 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -247,7 +247,7 @@ def add_core_args(self): help="Path to an orion search space yaml file", ) self.parser.add_argument( - "--unique_exp_name", + "--orion_unique_exp_name", "-u", type=str, help="Name for this experiment. 
If the experiment name already exists," From 4356249038064c5c59626328a88ff41cbf2a3002 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 02:30:31 -0500 Subject: [PATCH 044/273] orion v0.1 --- configs/exps/debug/orion.yaml | 6 ++-- main.py | 49 ++++++++++++++-------------- ocpmodels/common/utils.py | 5 +-- ocpmodels/trainers/single_trainer.py | 2 +- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml index 1e23cf9299..4a02537aa9 100644 --- a/configs/exps/debug/orion.yaml +++ b/configs/exps/debug/orion.yaml @@ -5,7 +5,7 @@ job: gres: gpu:1 time: 1:00:00 partition: main - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab env: ocp-a100 default: @@ -42,13 +42,13 @@ orion: # Remember to change the experiment name if you change anything in the search space _meta_: n_runs: 2 - unique_exp_name: ocp-qm9-orion-debug + unique_exp_name: ocp-qm9-orion-debug-v0.0.2 optim: batch_size: uniform(32, 1024, discrete=True) lr_initial: loguniform(1e-5, 5e-3, precision=2) max_steps: fidelity(1e4, 1e6, base=5e5) model: - num_gaussians: uniform(16, 200, base=20, discrete=True) + num_gaussians: uniform(16, 200, discrete=True) hidden_channels: uniform(32, 512, discrete=True) num_filters: uniform(32, 512, discrete=True) num_interactions: uniform(1, 7, discrete=True) diff --git a/main.py b/main.py index 2fc13171e3..03c4401b38 100644 --- a/main.py +++ b/main.py @@ -8,8 +8,6 @@ import copy import logging import os -import re -import subprocess import time import traceback import warnings @@ -70,27 +68,29 @@ class Runner: def __init__(self, trainer_config): self.trainer_config = trainer_config self.trainer = None + self.hparams = {} - def run(self, **hparams): + def run(self, orion_exp=None): + orion_trial = None self.original_config = copy.deepcopy(self.trainer_config) if distutils.is_master(): - orion_trial = hparams.pop("orion_trial", None) - if orion_trial: - hparams["orion_hash_params"] = orion_trial.hash_params - self.hparams = hparams + if orion_exp: + orion_trial = orion_exp.suggest(1) + self.hparams = orion_trial.params + self.hparams["orion_hash_params"] = orion_trial.hash_params should_be_0 = distutils.get_rank() - hp_list = [hparams, should_be_0] + hp_list = [self.hparams, should_be_0] # print("hparams pre-broadcast: ", hparams) distutils.broadcast_object_list(hp_list) - hparams, should_be_0 = hp_list + self.hparams, should_be_0 = hp_list # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 - if hparams: + if self.hparams: print("Received hyper-parameters from Orion:") - print(hparams) + print(self.hparams) - self.trainer_config = merge_dicts(self.trainer_config, hparams) + self.trainer_config = merge_dicts(self.trainer_config, self.hparams) self.trainer_config = continue_orion_exp(self.trainer_config) cls = registry.get_trainer_class(self.trainer_config["trainer"]) self.trainer: BaseTrainer = cls(**self.trainer_config) @@ -118,7 +118,8 @@ def run(self, **hparams): objective = o_list[0] # print("objective post-broadcast: ", objective) - return [{"name": "energy_mae", "type": "objective", "value": objective}] + if orion_exp is not None: + orion_exp.observe(orion_trial, objective, name="energy_mae") if __name__ == "__main__": @@ -141,7 +142,6 @@ def run(self, **hparams): trainer_config = build_config(args, override_args) trainer_config["optim"]["eval_batch_size"] = trainer_config["optim"]["batch_size"] - setup_logging() 
original_trainer_config = copy.deepcopy(trainer_config) if args.distributed: @@ -152,16 +152,17 @@ def run(self, **hparams): trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) # distutils.synchronize() + # ------------------- + # ----- Setup ----- + # ------------------- + setup_imports() + print("All things imported.") + trainer_config = continue_from_slurm_job_id(trainer_config) + trainer_config = read_slurm_env(trainer_config) + runner = Runner(trainer_config) + print("Runner ready.") + try: - # ------------------- - # ----- Setup ----- - # ------------------- - setup_imports() - print("All things imported.") - trainer_config = continue_from_slurm_job_id(trainer_config) - trainer_config = read_slurm_env(trainer_config) - runner = Runner(trainer_config) - print("Runner ready.") # ------------------- # ----- Train ----- # ------------------- @@ -182,7 +183,7 @@ def run(self, **hparams): space=space, algorithms={"asha": {"seed": 123}}, ) - experiment.workon(runner.run, max_trials_per_worker=1, n_workers=1) + runner.run(orion_exp=experiment) else: print("Starting runner.") runner.run() diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index cd4f426032..35a63ceead 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -52,8 +52,9 @@ def continue_orion_exp(trainer_config): if "orion_hash_params" not in trainer_config: faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" print( - "\n\nWARNING: trainer_config has 'orion_search_path' and 'orion_unique_exp_name'", - "but no 'orion_trial'. This can lead to inconsistencies.", + "\n\nWARNING: trainer_config has 'orion_search_path' and", + "'orion_unique_exp_name' but no 'orion_hash_params'.", + "This can lead to inconsistencies.", f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", ) faulty_path.write_text(yaml.dump(trainer_config)) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 6f9234d45a..e91a6ea9ba 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -229,7 +229,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): if self.sigterm: return "SIGTERM" i_for_epoch += 1 - print(self.now, "i_for_epoch: ", i_for_epoch, flush=True) + # print(self.now, "i_for_epoch: ", i_for_epoch, flush=True) self.epoch = epoch_int + (i + 1) / n_train self.step = epoch_int * n_train + i + 1 From 93d5f139145cf2c2afe5c9aac1f4d5dfcf7c2225 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 02:38:54 -0500 Subject: [PATCH 045/273] missing args to end_of_training() calls --- ocpmodels/trainers/single_trainer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index e91a6ea9ba..ec30642ca4 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -331,7 +331,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") if self.logger: self.logger.add_tags(["E-S"]) - return self.end_of_training() + return self.end_of_training( + epoch_int, debug_batches, model_run_time, epoch_times + ) self.model.train() @@ -350,7 +352,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # End of training. 
if not is_test_env: - return self.end_of_training() + return self.end_of_training( + epoch_int, debug_batches, model_run_time, epoch_times + ) def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times): From 182e081273a8249a414632419a880d7b16ad2c43 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 02:53:43 -0500 Subject: [PATCH 046/273] validate on test for qm9 --- configs/models/tasks/qm9.yaml | 1 + ocpmodels/common/flags.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/models/tasks/qm9.yaml b/configs/models/tasks/qm9.yaml index 42c64032f8..262ec232e7 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -1,6 +1,7 @@ default: trainer: single logger: wandb + eval_on_test: True model: otf_graph: False diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 8142c80858..e1d19fbb5f 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -218,8 +218,7 @@ def add_core_args(self): ) self.parser.add_argument( "--eval_on_test", - action="store_true", - default=False, + type=bool, help="Evaluate on test set", ) self.parser.add_argument( From 512400847a3cf74fc4a15bf1346d12d68b71490a Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 10 Jan 2023 06:38:50 -0500 Subject: [PATCH 047/273] add edge embed option to sfarinet --- configs/exps/gnn/s2ef_1gpu.yaml | 47 +++++++++++++++++++ configs/models/sfarinet.yaml | 7 ++- ocpmodels/models/sfarinet.py | 81 ++++++++++++++++++++++++++++----- 3 files changed, 123 insertions(+), 12 deletions(-) create mode 100644 configs/exps/gnn/s2ef_1gpu.yaml diff --git a/configs/exps/gnn/s2ef_1gpu.yaml b/configs/exps/gnn/s2ef_1gpu.yaml new file mode 100644 index 0000000000..4decd7316f --- /dev/null +++ b/configs/exps/gnn/s2ef_1gpu.yaml @@ -0,0 +1,47 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 30:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + cp_data_to_tmp: true + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? 
+ regress_forces: direct_with_gradient_target + wandb_tags: 's2ef-archi-tests' + optim: + max_epochs: 10 + batch_size: 192 + eval_batch_size: 192 + +runs: + - config: sfarinet-s2ef-2M + note: 'Bigger forces coef' + frame_averaging: 2D + fa_fames: se3-random + optim: + force_coefficient: 75 + - config: sfarinet-s2ef-2M + note: 'Bigger forces coef' + frame_averaging: 2D + fa_fames: se3-random + model: + regress_forces: direct + optim: + force_coefficient: 75 + - config: sfarinet-s2ef-2M + note: 'Bigger forces coef' + frame_averaging: 2D + fa_fames: se3-random + model: + regress_forces: direct + optim: + force_coefficient: 75 diff --git a/configs/models/sfarinet.yaml b/configs/models/sfarinet.yaml index 57bc1afdec..0e2d82d993 100644 --- a/configs/models/sfarinet.yaml +++ b/configs/models/sfarinet.yaml @@ -15,6 +15,7 @@ default: phys_embeds: False # True phys_hidden_channels: 0 energy_head: False # can be {False, weighted-av-initial-embeds, weighted-av-final-embeds, pooling, graclus, random} + edge_embed_type: "" force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True force_decoder_model_config: simple: @@ -89,7 +90,7 @@ is2re: # ----- S2EF ----- # ------------------ -# For 4 GPUs +# For 4 GPUs s2ef: default: @@ -123,6 +124,10 @@ s2ef: all: {} +# ------------------ +# ----- QM9 ----- +# ------------------ + qm9: default: model: diff --git a/ocpmodels/models/sfarinet.py b/ocpmodels/models/sfarinet.py index f55fc414aa..da3ad2a985 100644 --- a/ocpmodels/models/sfarinet.py +++ b/ocpmodels/models/sfarinet.py @@ -2,18 +2,19 @@ """ import torch +from e3nn.o3 import spherical_harmonics from torch import nn from torch.nn import Embedding, Linear from torch_geometric.nn import MessagePassing, radius_graph from torch_scatter import scatter from ocpmodels.common.registry import registry -from ocpmodels.common.utils import get_pbc_distances, conditional_grad +from ocpmodels.common.utils import conditional_grad, get_pbc_distances from ocpmodels.models.base_model import BaseModel +from ocpmodels.models.force_decoder import ForceDecoder from ocpmodels.models.utils.pos_encodings import PositionalEncoding from ocpmodels.modules.phys_embeddings import PhysEmbedding from ocpmodels.modules.pooling import Graclus, Hierarchical_Pooling -from ocpmodels.models.force_decoder import ForceDecoder try: from torch_geometric.nn.acts import swish @@ -47,6 +48,7 @@ def __init__( phys_embeds, graph_rewiring, act, + edge_embed_type, ): super().__init__() self.act = act @@ -58,6 +60,7 @@ def __init__( "one-supernode-per-atom-type", "one-supernode-per-atom-type-dist", } + self.edge_embed_type = edge_embed_type # Phys embeddings self.phys_emb = PhysEmbedding( @@ -97,7 +100,24 @@ def __init__( # MLP self.lin = Linear(hidden_channels, hidden_channels) - self.lin_e = Linear(num_gaussians + 3, hidden_channels) + + # --- Edge embedding --- + if self.edge_embed_type == "": + self.lin_e = Linear(num_gaussians + 3, hidden_channels) + elif self.edge_embed_type == "rij": + self.lin_e = Linear(3, hidden_channels) + elif self.edge_embed_type == "all_rij": + self.lin_e = Linear(3, hidden_channels // 3) # r_ij + self.lin_e2 = Linear(3, hidden_channels // 3) # norm r_ij + self.lin_e3 = Linear( + num_gaussians, hidden_channels - 2 * (hidden_channels // 3) + ) # d_ij + elif self.edge_embed_type == "sh": + self.lin_e = Linear(15, hidden_channels) + elif self.edge_embed_type == "all": + self.lin_e = Linear(18, hidden_channels) + else: + raise ValueError("edge_embedding_type does not exist") self.reset_parameters() 
@@ -114,11 +134,47 @@ def reset_parameters(self): self.lin.bias.data.fill_(0) nn.init.xavier_uniform_(self.lin_e.weight) self.lin_e.bias.data.fill_(0) + if self.edge_embed_type == "all_rij": + nn.init.xavier_uniform_(self.lin_e2.weight) + self.lin_e2.bias.data.fill_(0) + nn.init.xavier_uniform_(self.lin_e3.weight) + self.lin_e3.bias.data.fill_(0) + + def forward( + self, z, rel_pos, edge_attr, tag=None, normalised_rel_pos=None, subnodes=None + ): + + # --- Edge embedding -- + + if self.edge_embed_type == "rij": + e = self.lin_e(rel_pos) + elif self.edge_embed_type == "all_rij": + rel_pos = self.lin_e(rel_pos) # r_ij + normalized_rel_pos = self.lin_e2(normalised_rel_pos) # norm r_ij + edge_attr = self.lin_e3(edge_attr) # d_ij + e = torch.cat((rel_pos, edge_attr, normalized_rel_pos), dim=1) + elif self.edge_embed_type == "sh": + self.sh = spherical_harmonics( + l=[1, 2, 3], + x=normalised_rel_pos, + normalize=False, + normalization="component", + ) + e = self.lin_e(self.sh) + elif self.edge_embed_type == "all": + self.sh = spherical_harmonics( + l=[1, 2, 3], + x=normalised_rel_pos, + normalize=False, + normalization="component", + ) + e = torch.cat((rel_pos, self.sh), dim=1) + e = self.lin_e(e) + else: + e = torch.cat((rel_pos, edge_attr), dim=1) + e = self.lin_e(e) - def forward(self, z, rel_pos, edge_attr, tag=None, subnodes=None): - # Create edge embeddings from d_ij || r_ij - e = torch.cat((rel_pos, edge_attr), dim=1) - # Extension: learn a bond feature vector and concat to above + # --- Atom embedding -- # Create atom embeddings based on its characteristic number h = self.emb(z) @@ -153,7 +209,6 @@ def forward(self, z, rel_pos, edge_attr, tag=None, subnodes=None): # Apply MLP h = self.lin(h) - e = self.lin_e(e) return h, e @@ -269,6 +324,7 @@ class SfariNet(BaseModel): force_decoder_type (str): Type of force decoder to use. 
force_decoder_model_config (dict): Dictionary of config parameters for the decoder's model + edge_embed_type (str): type of edge_embedding """ def __init__(self, **kwargs): @@ -279,6 +335,7 @@ def __init__(self, **kwargs): self.max_num_neighbors = kwargs["max_num_neighbors"] self.regress_forces = kwargs["regress_forces"] self.energy_head = kwargs["energy_head"] + self.edge_embed_type = kwargs["edge_embed_type"] self.distance_expansion = GaussianSmearing( 0.0, self.cutoff, kwargs["num_gaussians"] @@ -297,6 +354,7 @@ def __init__(self, **kwargs): kwargs["phys_embeds"], kwargs["graph_rewiring"], self.act, + kwargs["edge_embed_type"], ) # Interaction block @@ -382,13 +440,14 @@ def energy_forward(self, data): edge_attr = self.distance_expansion(edge_weight) # Normalize and squash to [0,1] for gaussian basis - rel_pos_normalized = rel_pos / edge_weight.view(-1, 1) - rel_pos_normalized = (rel_pos_normalized + 1) / 2.0 + rel_pos_normalized = None + if self.edge_embed_type in {"sh", "all_rij", "all"}: + rel_pos_normalized = (rel_pos / edge_weight.view(-1, 1) + 1) / 2.0 pooling_loss = None # deal with pooling loss # Embedding block - h, e = self.embed_block(z, rel_pos, edge_attr, data.tags) + h, e = self.embed_block(z, rel_pos, edge_attr, data.tags, rel_pos_normalized) # Compute atom weights for late energy head if self.energy_head == "weighted-av-initial-embeds": From aaf07e6d9c4114bfcc747c65be98a8066c6a96d8 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 10 Jan 2023 08:29:53 -0500 Subject: [PATCH 048/273] test edge embed and mp type for is2re --- configs/exps/gnn/edge_embed_type.yaml | 65 ++++++++++++++ configs/exps/gnn/mp_type.yaml | 122 ++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 configs/exps/gnn/edge_embed_type.yaml create mode 100644 configs/exps/gnn/mp_type.yaml diff --git a/configs/exps/gnn/edge_embed_type.yaml b/configs/exps/gnn/edge_embed_type.yaml new file mode 100644 index 0000000000..e64fd7bc3f --- /dev/null +++ b/configs/exps/gnn/edge_embed_type.yaml @@ -0,0 +1,65 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? 
+ wandb_tags: 'edge-embed-test' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + +runs: + - config: sfarinet-is2re-all + note: 'Sfarinet no sym' + - config: sfarinet-is2re-all + note: 'Sfarinet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: sfarinet-is2re-all + note: 'Sfarinet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: sfarinet-is2re-all + note: 'rij' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: rij + - config: sfarinet-is2re-all + note: 'sh' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: sh + - config: sfarinet-is2re-all + note: 'all rij' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all_rij + - config: sfarinet-is2re-all + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + - config: sfarinet-is2re-all + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + optim: + lr_initial: 0.0007 \ No newline at end of file diff --git a/configs/exps/gnn/mp_type.yaml b/configs/exps/gnn/mp_type.yaml new file mode 100644 index 0000000000..47d44133ba --- /dev/null +++ b/configs/exps/gnn/mp_type.yaml @@ -0,0 +1,122 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + +runs: + - config: fanet-is2re-all + note: 'fanet no sym' + - config: fanet-is2re-all + note: 'fanet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: fanet-is2re-all + note: 'fanet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: fanet-is2re-all + note: 'simple' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + - config: fanet-is2re-all + note: 'updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + lr_initial: 0.0005 + - config: fanet-is2re-all + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: + optim: + lr_initial: 0.0001 + mp_type: base + - config: fanet-is2re-all + note: 'smaller lr' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + optim: + lr_initial: 0.0005 + - config: fanet-is2re-all + note: 'small warmup factor and lr' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + lr_initial: 0.0005 + warmup_factor: 0.01 + - config: fanet-is2re-all + note: 'warmup factor + lr' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + lr_initial: 0.0005 + warmup_factor: 0.5 + - config: fanet-is2re-all + note: 'big batch size' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 328 + eval_batch_size: 328 + - config: fanet-is2re-all + note: 'smaller batch' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 180 + eval_batch_size: 180 + - config: fanet-is2re-all + note: 'smaller hidden smaller lr' + 
frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + hidden_channels: 180 + optim: + lr_initial: 0.0005 \ No newline at end of file From b83b142b04696060c084447611370ef0a03e80aa Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 10 Jan 2023 11:03:50 -0500 Subject: [PATCH 049/273] add local env (x2), sfarinet and base_with_att GNN extensions --- ocpmodels/models/fanet.py | 63 +++++++--- ocpmodels/models/utils/attention_model.py | 137 ++++++++++++++++++++++ scripts/gnn_dev.py | 6 +- 3 files changed, 187 insertions(+), 19 deletions(-) create mode 100644 ocpmodels/models/utils/attention_model.py diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 37a58f61f2..2279b41317 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -1,7 +1,5 @@ """ Code of the Scalable Frame Averaging (Rotation Invariant) GNN """ -import math - import torch from e3nn.o3 import spherical_harmonics from torch import nn @@ -13,6 +11,7 @@ from ocpmodels.common.utils import conditional_grad, get_pbc_distances from ocpmodels.models.base_model import BaseModel from ocpmodels.models.force_decoder import ForceDecoder +from ocpmodels.models.utils.attention_model import AttConv from ocpmodels.models.utils.pos_encodings import PositionalEncoding from ocpmodels.modules.phys_embeddings import PhysEmbedding from ocpmodels.modules.pooling import Graclus, Hierarchical_Pooling @@ -237,21 +236,25 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): super(InteractionBlock, self).__init__() self.act = act self.mp_type = mp_type + self.hidden_channels = hidden_channels if self.mp_type == "simple": self.lin_geom = nn.Linear(num_filters, hidden_channels) self.lin_h = nn.Linear(hidden_channels, hidden_channels) + elif self.mp_type == "sfarinet": + self.lin_h = nn.Linear(hidden_channels, hidden_channels) + elif self.mp_type == "updownscale": - self.lin_geom = nn.Linear(num_filters + 2 * hidden_channels, num_filters) - # self.lin_geom = nn.Linear(num_filters, num_filters) # like 'simple' + # self.lin_geom = nn.Linear(num_filters + 2 * hidden_channels, num_filters) + self.lin_geom = nn.Linear(num_filters, num_filters) # like 'simple' self.lin_down = nn.Linear(hidden_channels, num_filters) self.lin_up = nn.Linear(num_filters, hidden_channels) elif self.mp_type == "base_with_att": # --- Compute attention coefficients if required -- - # Change message function - pass + self.lin_h = nn.Linear(hidden_channels, hidden_channels) + self.lin_geom = AttConv(hidden_channels, heads=1, concat=True, bias=True) elif self.mp_type == "att": # --- Compute attention coefficients if required -- @@ -259,7 +262,12 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): pass elif self.mp_type == "local_env": - pass + self.lin_geom = nn.Linear(num_filters, hidden_channels) + self.lin_h = nn.Linear(hidden_channels, hidden_channels) + + elif self.mp_type == "up_down_local_env": + self.lin_h = nn.Linear(hidden_channels, num_filters) + self.lin_geom = nn.Linear(2 * num_filters, hidden_channels) else: # base self.lin_geom = nn.Linear( @@ -268,8 +276,9 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_h = nn.Linear(hidden_channels, hidden_channels) def reset_parameters(self): - nn.init.xavier_uniform_(self.lin_geom.weight) - self.lin_geom.bias.data.fill_(0) + if self.mp_type != "sfarinet": + nn.init.xavier_uniform_(self.lin_geom.weight) + self.lin_geom.bias.data.fill_(0) nn.init.xavier_uniform_(self.lin_h.weight) self.lin_h.bias.data.fill_(0) if self.mp_type == 
"updownscale": @@ -280,32 +289,54 @@ def reset_parameters(self): def forward(self, h, edge_index, e): - if self.mp_type != "simple": + if self.mp_type in {"base"}: e = torch.cat([e, h[edge_index[0]], h[edge_index[1]]], dim=1) # W = self.lin_e_2(self.act(self.lin_e_1(e))) # transform edge rep - W = self.lin_geom(e) + if self.mp_type in {"up_down_local_env", "sfarinet", "base_with_att"}: + W = e + else: + W = self.lin_geom(e) if self.mp_type == "updownscale": h = self.lin_down(h) # downscale node rep. h = self.propagate(edge_index, x=h, W=W) # propagate h = self.lin_up(self.act(h)) # upscale node rep. + elif self.mp_type == "att": # Look at So3krates code pass elif self.mp_type == "base_with_att": - # Combine above and base - pass + h = self.lin_h(self.act(h)) + h = self.lin_geom(h, edge_index, W) + elif self.mp_type == "local_env": - pass + h = self.lin_h(self.act(h)) + chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate + h = self.propagate(edge_index, x=h, W=W) # propagate + h = h + chi + # h = h * chi + elif self.mp_type == "up_down_local_env": + h = self.lin_h(self.act(h)) + chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate + h = self.propagate(edge_index, x=h, W=W) # propagate + h = torch.cat((h, chi), dim=1) + h = self.lin_geom(h) + else: # base, simple h = self.lin_h(self.act(h)) h = self.propagate(edge_index, x=h, W=W) # propagate return h - def message(self, x_j, W): - return x_j * W + def message(self, x_j, W, local_env=None, att=None): + if local_env is not None: + return W + elif att is not None: + # Compute alpha_i + return alpha_i * x_j * W + else: + return x_j * W class OutputBlock(nn.Module): diff --git a/ocpmodels/models/utils/attention_model.py b/ocpmodels/models/utils/attention_model.py new file mode 100644 index 0000000000..4d10561de1 --- /dev/null +++ b/ocpmodels/models/utils/attention_model.py @@ -0,0 +1,137 @@ +from typing import Optional, Tuple, Union + +import torch +from torch import Tensor +from torch.nn import Parameter +from torch_geometric.nn.conv import MessagePassing +from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size +from torch_sparse import SparseTensor + +from ..inits import glorot, zeros + + +class AttConv(MessagePassing): + r"""The graph attentional operator from the `"Graph Attention Networks" + `_ paper + + Args: + hidden_channels (int): Size of each input sample, or :obj:`-1` to + derive the size from the first input(s) to the forward method. + A tuple corresponds to the sizes of source and target + dimensionalities. + heads (int, optional): Number of multi-head-attentions. + (default: :obj:`1`) + concat (bool, optional): If set to :obj:`False`, the multi-head + attentions are averaged instead of concatenated. + (default: :obj:`True`) + bias (bool, optional): If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. + + Shapes: + - **input:** + node features :math:`(|\mathcal{V}|, F_{in})` or + :math:`((|\mathcal{V_s}|, F_{s}), (|\mathcal{V_t}|, F_{t}))` + if bipartite, + edge indices :math:`(2, |\mathcal{E}|)`, + edge features :math:`(|\mathcal{E}|, D)` *(optional)* + - **output:** node features :math:`(|\mathcal{V}|, H * F_{out})` or + :math:`((|\mathcal{V}_t|, H * F_{out})` if bipartite. 
+ If :obj:`return_attention_weights=True`, then + :math:`((|\mathcal{V}|, H * F_{out}), + ((2, |\mathcal{E}|), (|\mathcal{E}|, H)))` + or :math:`((|\mathcal{V_t}|, H * F_{out}), ((2, |\mathcal{E}|), + (|\mathcal{E}|, H)))` if bipartite + """ + + def __init__( + self, + hidden_channels: int, + heads: int = 1, + concat: bool = True, + bias: bool = True, + **kwargs, + ): + kwargs.setdefault("aggr", "add") + super().__init__(node_dim=0, **kwargs) + + self.hidden_channels = hidden_channels + self.heads = heads + self.concat = concat + + # The learnable parameters to compute attention coefficients: + self.att_src = Parameter(torch.Tensor(1, heads, hidden_channels)) + self.att_dst = Parameter(torch.Tensor(1, heads, hidden_channels)) + + if bias and concat: + self.bias = Parameter(torch.Tensor(heads * hidden_channels)) + elif bias and not concat: + self.bias = Parameter(torch.Tensor(hidden_channels)) + else: + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self): + glorot(self.att_src) + glorot(self.att_dst) + zeros(self.bias) + + def forward( + self, + x: Union[Tensor, OptPairTensor], + edge_index: Adj, + edge_attr: OptTensor = None, + size: Size = None, + return_attention_weights=None, + ): + r""" + Args: + return_attention_weights (bool, optional): If set to :obj:`True`, + will additionally return the tuple + :obj:`(edge_index, attention_weights)`, holding the computed + attention weights for each edge. (default: :obj:`None`) + """ + # NOTE: attention weights will be returned whenever + # `return_attention_weights` is set to a value, regardless of its + # actual value (might be `True` or `False`). + + x.view(-1, self.heads, self.hidden_channels) + + # Next, we compute node-level attention coefficients, both for source + # and target nodes (if present): + alpha = (x * self.att_src).sum(dim=-1) + + # edge_updater_type: (alpha: OptPairTensor, edge_attr: OptTensor) + alpha = self.edge_updater(edge_index, alpha=alpha, edge_attr=edge_attr) + + # propagate_type: (x: OptPairTensor, alpha: Tensor) + out = self.propagate( + edge_index, x=x, alpha=alpha, edge_attr=edge_attr, size=size + ) + + if self.concat: + out = out.view(-1, self.heads * self.hidden_channels) + else: + out = out.mean(dim=1) + + if self.bias is not None: + out = out + self.bias + + if isinstance(return_attention_weights, bool): + if isinstance(edge_index, Tensor): + return out, (edge_index, alpha) + elif isinstance(edge_index, SparseTensor): + return out, edge_index.set_value(alpha, layout="coo") + else: + return out + + def message(self, x_j: Tensor, alpha: Tensor, edge_attr: Tensor) -> Tensor: + return alpha.unsqueeze(-1) * x_j * edge_attr + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}({self.in_channels}, " + f"{self.hidden_channels}, heads={self.heads})" + ) diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index d2e114c655..40dc41a18f 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -24,7 +24,7 @@ config["optim"] = {"max_epochs": 0} config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "rij" - config["model"]["mp_type"] = "base" + # config["model"]["mp_type"] = "base" checkpoint_path = None # "checkpoints/2022-04-28-11-42-56-dimenetplusplus/" + "best_checkpoint.pt" @@ -32,8 +32,8 @@ str_args = sys.argv[1:] if all("config" not in arg for arg in str_args): str_args.append("--is_debug") - # str_args.append("--config=fanet-is2re-10k") - str_args.append("--config=sfarinet-s2ef-2M") + str_args.append("--config=sfarinet-is2re-10k") + # 
str_args.append("--config=sfarinet-s2ef-2M") warnings.warn( "No model / mode is given; chosen as default" + f"Using: {str_args[-1]}" ) From 538c9f7262a904d07b3f1c5f442e833fa9e7d938 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 11:14:25 -0500 Subject: [PATCH 050/273] tags a tuple --- ocpmodels/common/logger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 1e329d59f0..8f47c038ae 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -149,6 +149,7 @@ def mark_preempting(self): def add_tags(self, tags): if not isinstance(tags, list): tags = [tags] + tags = tuple(tags) self.run.tags = self.run.tags + tags def collect_output_files(self, policy="now"): From d2f3e3f7345cbaa5914222dfcaac1253b96a0888 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 12:02:21 -0500 Subject: [PATCH 051/273] fix `observe` signature --- main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 03c4401b38..b4add00fb6 100644 --- a/main.py +++ b/main.py @@ -119,7 +119,10 @@ def run(self, orion_exp=None): # print("objective post-broadcast: ", objective) if orion_exp is not None: - orion_exp.observe(orion_trial, objective, name="energy_mae") + orion_exp.observe( + orion_trial, + {"type": "objective", "name": "energy_mae", "value": objective}, + ) if __name__ == "__main__": From 02e2d5602de4aeca29b32dcac7640f8189889928 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 10 Jan 2023 13:02:14 -0500 Subject: [PATCH 052/273] two attention mechanisms --- ocpmodels/models/fanet.py | 27 +++++++++++++---------- ocpmodels/models/utils/attention_model.py | 17 +++++++++----- ocpmodels/trainers/base_trainer.py | 7 ++++-- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 2279b41317..6d893b2a26 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -4,7 +4,7 @@ from e3nn.o3 import spherical_harmonics from torch import nn from torch.nn import Embedding, Linear -from torch_geometric.nn import MessagePassing, radius_graph +from torch_geometric.nn import MessagePassing, TransformerConv, radius_graph from torch_scatter import scatter from ocpmodels.common.registry import registry @@ -258,8 +258,15 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): elif self.mp_type == "att": # --- Compute attention coefficients if required -- - # Change message function - pass + self.lin_h = nn.Linear(hidden_channels, hidden_channels) + self.lin_geom = TransformerConv( + hidden_channels, + hidden_channels, + heads=1, + concat=True, + root_weight=False, + edge_dim=num_filters, + ) elif self.mp_type == "local_env": self.lin_geom = nn.Linear(num_filters, hidden_channels) @@ -293,7 +300,7 @@ def forward(self, h, edge_index, e): e = torch.cat([e, h[edge_index[0]], h[edge_index[1]]], dim=1) # W = self.lin_e_2(self.act(self.lin_e_1(e))) # transform edge rep - if self.mp_type in {"up_down_local_env", "sfarinet", "base_with_att"}: + if self.mp_type in {"up_down_local_env", "sfarinet", "base_with_att", "att"}: W = e else: W = self.lin_geom(e) @@ -304,12 +311,11 @@ def forward(self, h, edge_index, e): h = self.lin_up(self.act(h)) # upscale node rep. 
elif self.mp_type == "att": - # Look at So3krates code - pass + h = self.lin_h(self.act(h)) + h = self.lin_geom(h, edge_index, edge_attr=W) elif self.mp_type == "base_with_att": h = self.lin_h(self.act(h)) - h = self.lin_geom(h, edge_index, W) - + h = self.lin_geom(h, edge_index, W) # propagate is inside elif self.mp_type == "local_env": h = self.lin_h(self.act(h)) chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate @@ -329,12 +335,9 @@ def forward(self, h, edge_index, e): return h - def message(self, x_j, W, local_env=None, att=None): + def message(self, x_j, W, local_env=None): if local_env is not None: return W - elif att is not None: - # Compute alpha_i - return alpha_i * x_j * W else: return x_j * W diff --git a/ocpmodels/models/utils/attention_model.py b/ocpmodels/models/utils/attention_model.py index 4d10561de1..b280680f51 100644 --- a/ocpmodels/models/utils/attention_model.py +++ b/ocpmodels/models/utils/attention_model.py @@ -1,14 +1,15 @@ -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch +import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter from torch_geometric.nn.conv import MessagePassing +from torch_geometric.nn.inits import glorot, zeros from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size +from torch_geometric.utils import softmax from torch_sparse import SparseTensor -from ..inits import glorot, zeros - class AttConv(MessagePassing): r"""The graph attentional operator from the `"Graph Attention Networks" @@ -103,12 +104,11 @@ def forward( # and target nodes (if present): alpha = (x * self.att_src).sum(dim=-1) - # edge_updater_type: (alpha: OptPairTensor, edge_attr: OptTensor) - alpha = self.edge_updater(edge_index, alpha=alpha, edge_attr=edge_attr) + alpha = self.edge_updater(edge_index, alpha=(alpha, None)) # propagate_type: (x: OptPairTensor, alpha: Tensor) out = self.propagate( - edge_index, x=x, alpha=alpha, edge_attr=edge_attr, size=size + edge_index, x=x, alpha=alpha, size=size, edge_attr=edge_attr ) if self.concat: @@ -127,6 +127,11 @@ def forward( else: return out + def edge_update(self, alpha_j: Tensor, alpha_i: OptTensor, index: Tensor) -> Tensor: + alpha_j = F.leaky_relu(alpha_j) + alpha_j = softmax(alpha_j, index) + return alpha_j + def message(self, x_j: Tensor, alpha: Tensor, edge_attr: Tensor) -> Tensor: return alpha.unsqueeze(-1) * x_j * edge_attr diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 72bfea076d..7d3f3d0c86 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -44,7 +44,7 @@ ) from ocpmodels.modules.loss import DDPLoss, L2MAELoss from ocpmodels.modules.normalizer import Normalizer -from ocpmodels.modules.scheduler import LRScheduler, EarlyStopper +from ocpmodels.modules.scheduler import EarlyStopper, LRScheduler @registry.register_trainer("base") @@ -75,7 +75,10 @@ def __init__(self, **kwargs): self.test_ri = self.config["test_ri"] self.is_debug = self.config["is_debug"] self.is_hpo = self.config["is_hpo"] - self.eval_on_test = self.config["eval_on_test"] + if self.task_name == "qm9": + self.eval_on_test = self.config["eval_on_test"] + else: + self.eval_on_test = False self.silent = self.config["silent"] self.datasets = {} self.samplers = {} From 449fe5a1940f957021dfeb63c514f91e0be37257 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 14:15:40 -0500 Subject: [PATCH 053/273] orion trial result is a list --- main.py | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b4add00fb6..d4ed02d42d 100644 --- a/main.py +++ b/main.py @@ -121,7 +121,7 @@ def run(self, orion_exp=None): if orion_exp is not None: orion_exp.observe( orion_trial, - {"type": "objective", "name": "energy_mae", "value": objective}, + [{"type": "objective", "name": "energy_mae", "value": objective}], ) From 85ea173f249f1715b1f8fdef91bb459427c6aff0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 17:22:21 -0500 Subject: [PATCH 054/273] resume wandb if resume from Orion --- main.py | 2 +- ocpmodels/common/logger.py | 32 +++++++++++++++++++------------- ocpmodels/common/utils.py | 21 ++++++++++++++------- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/main.py b/main.py index d4ed02d42d..a98b1dcce6 100644 --- a/main.py +++ b/main.py @@ -87,7 +87,7 @@ def run(self, orion_exp=None): # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 if self.hparams: - print("Received hyper-parameters from Orion:") + print("\n💎 Received hyper-parameters from Orion:") print(self.hparams) self.trainer_config = merge_dicts(self.trainer_config, self.hparams) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 8f47c038ae..dab33affcc 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -90,27 +90,33 @@ def mark_preempting(self): class WandBLogger(Logger): def __init__(self, trainer_config): super().__init__(trainer_config) + wandb_tags = note = name = None - wandb_id = str(self.trainer_config.get("wandb_id", "")) - if wandb_id: - wandb_id += " - " - slurm_jobid = os.environ.get("SLURM_JOB_ID") - if slurm_jobid: - wandb_id += f"{slurm_jobid}-" - wandb_id += self.trainer_config["config"] - - wandb_tags = trainer_config.get("wandb_tags", "") - if wandb_tags: - wandb_tags = [t.strip() for t in wandb_tags[:63].split(",")] + if trainer_config.get("wandb_resume_id"): + wandb_id = trainer_config["wandb_resume_id"] + else: + wandb_id = str(self.trainer_config.get("wandb_id", "")) + if wandb_id: + wandb_id += " - " + slurm_jobid = os.environ.get("SLURM_JOB_ID") + if slurm_jobid: + wandb_id += f"{slurm_jobid}-" + wandb_id += self.trainer_config["config"] + + wandb_tags = trainer_config.get("wandb_tags", "") + if wandb_tags: + wandb_tags = [t.strip() for t in wandb_tags[:63].split(",")] + note = self.trainer_config.get("note", "") + name = self.trainer_config["wandb_name"] or wandb_id self.run = wandb.init( config=self.trainer_config, id=wandb_id, - name=self.trainer_config["wandb_name"] or wandb_id, + name=name, dir=self.trainer_config["logs_dir"], project=self.trainer_config["wandb_project"], resume="allow", - notes=self.trainer_config.get("note", ""), + notes=note, tags=wandb_tags, entity="mila-ocp", ) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 35a63ceead..1de5556b1d 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -82,18 +82,25 @@ def continue_orion_exp(trainer_config): if not latest_dirs: return trainer_config - latest_ckpts = sorted( - [f for f in (latest_dirs[-1] / "checkpoints").glob("checkpoint-*")], + resume_dir = latest_dirs[-1] + + resume_ckpts = sorted( + [f for f in (resume_dir / "checkpoints").glob("checkpoint-*")], key=lambda f: float(f.stem.split("-")[-1]), ) - if not latest_ckpts: - raise ValueError(f"No checkpoint found in {str(latest_dirs[-1])}") - trainer_config["checkpoint"] = str(latest_ckpts[-1]) + if not resume_ckpts: + raise ValueError(f"No checkpoint found in {str(resume_dir)}") + 
trainer_config["checkpoint"] = str(resume_ckpts[-1]) + resume_url = (resume_dir / "wandb_url.txt").read_text() + trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] + print( - f"\nFound {len(latest_ckpts)} existing Orion runs.", + f"\n🎁 Found {len(resume_ckpts)} existing Orion runs.", "Resuming from latest:", - str(latest_dirs[-1]), + str(resume_dir), + "\nOn wandb run:", + resume_url, ) print("Based on unique file id:", id_file) print("Continuing from checkpoint:", trainer_config["checkpoint"], end="\n\n") From fc493fc636fc4bfad45ac1e9aa9cdd587ceefb50 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 17:25:43 -0500 Subject: [PATCH 055/273] scheduler selection more robust --- ocpmodels/modules/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 0d993b3925..7c5c01ac1a 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -26,7 +26,7 @@ def __init__(self, optimizer, optim_config): self.optimizer = optimizer self.optim_config = optim_config.copy() self.warmup_scheduler = None - if "scheduler" in self.optim_config: + if self.optim_config.get("scheduler"): self.scheduler_type = self.optim_config["scheduler"] else: self.scheduler_type = "LambdaLR" From 97334932ff240bf961fda099902fa2397bca4d40 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 18:39:35 -0500 Subject: [PATCH 056/273] improve display --- main.py | 6 +++--- ocpmodels/trainers/base_trainer.py | 12 +++++++----- ocpmodels/trainers/single_trainer.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index a98b1dcce6..9570e809c2 100644 --- a/main.py +++ b/main.py @@ -15,7 +15,7 @@ import torch from orion.client import build_experiment -from yaml import safe_load +from yaml import safe_load, dump from ocpmodels.common import distutils from ocpmodels.common.flags import flags @@ -87,8 +87,8 @@ def run(self, orion_exp=None): # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 if self.hparams: - print("\n💎 Received hyper-parameters from Orion:") - print(self.hparams) + print("\n💎💎Received hyper-parameters from Orion:") + print(dump(self.hparams), end="\n💎💎\n") self.trainer_config = merge_dicts(self.trainer_config, self.hparams) self.trainer_config = continue_orion_exp(self.trainer_config) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 7d3f3d0c86..e410d3d885 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -139,7 +139,8 @@ def __init__(self, **kwargs): self.hpo_checkpoint_every = self.config["optim"].get("checkpoint_every", -1) if distutils.is_master() and not self.silent: - print(yaml.dump(self.config, default_flow_style=False)) + print("🧰 Trainer config:") + print(yaml.dump(self.config), end="\n\n") self.load() self.evaluator = Evaluator( @@ -287,7 +288,8 @@ def load_model(self): # Build model if distutils.is_master() and not self.silent: logging.info( - f"Loading model {self.config['model_name']}: {self.config['model']}" + f"Loading model {self.config['model_name']}:" + + f" {yaml.dump(self.config['model'])}" ) bond_feat_dim = None @@ -314,8 +316,8 @@ def load_model(self): f"{self.model.num_params} parameters." 
) - if self.logger is not None: - self.logger.watch(self.model) + # if self.logger is not None: + # self.logger.watch(self.model) self.model = OCPDataParallel( self.model, @@ -543,7 +545,7 @@ def validate( ): if distutils.is_master() and not self.silent: print() - logging.info(f"Evaluating on {split}.") + logging.info(f"🧐 Evaluating on {split}.") if self.is_hpo: disable_tqdm = True diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index ec30642ca4..adf460f72f 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -210,7 +210,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): model_run_time = 0 if not self.silent: - print("---Beginning of Training---") + print(f"--- 🔄 Beginning of Training @ {self.now}---") for epoch_int in range(start_epoch, self.config["optim"]["max_epochs"]): From 70c7efdb7188e5a4f161fdc3a8878090476d2cf9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 18:42:10 -0500 Subject: [PATCH 057/273] it's ok not to find checkpoints --- ocpmodels/common/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 1de5556b1d..95fca5b602 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -90,7 +90,9 @@ def continue_orion_exp(trainer_config): ) if not resume_ckpts: - raise ValueError(f"No checkpoint found in {str(resume_dir)}") + print(f"🥶 Warning: No checkpoint found in {str(resume_dir)}. Not resuming.") + return trainer_config + trainer_config["checkpoint"] = str(resume_ckpts[-1]) resume_url = (resume_dir / "wandb_url.txt").read_text() trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] From 2974e7d0828c9fe7f87e5a4d5630dfd3182bafd9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 18:50:11 -0500 Subject: [PATCH 058/273] handle keyboard interrupt --- launch_exp.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 34999ce607..c7109bfdef 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -143,6 +143,7 @@ def cli_arg(args, key=""): if __name__ == "__main__": + is_interrupted = False args = resolved_args() assert "exp" in args regex = args.get("match", ".*") @@ -215,10 +216,13 @@ def cli_arg(args, key=""): confirm = input("\n🚦 Confirm? [y/n]") if confirm == "y": - outputs = [ - print(f"Launching job {c:3}", end="\r") or os.popen(command).read().strip() - for c, command in enumerate(commands) - ] + try: + outputs = [] + for c, command in enumerate(commands): + print(f"Launching job {c:3}", end="\r") + outputs.append(os.popen(command).read().strip()) + except KeyboardInterrupt: + is_interrupted = True outdir = ROOT / "data" / "exp_outputs" / exp_name outfile = outdir / f"{exp_name.split('/')[-1]}_{ts}.txt" outfile.parent.mkdir(exist_ok=True, parents=True) @@ -228,14 +232,19 @@ def cli_arg(args, key=""): for line in text.splitlines() if (sep := "Submitted batch job ") in line ] - text += f"{separator}All jobs launched: {' '.join(jobs)}" - with outfile.open("w") as f: - f.write(text) - print(f"Output written to {str(outfile)}") - print(util_strings(jobs)) - yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) - print( - "Experiment summary YAML in ", f"./{str(yml_out.relative_to(Path.cwd()))}" - ) + + if is_interrupted: + print("\n💀 Interrupted. 
Kill jobs with:\n$ scancel" + " ".join(jobs)) + else: + text += f"{separator}All jobs launched: {' '.join(jobs)}" + with outfile.open("w") as f: + f.write(text) + print(f"Output written to {str(outfile)}") + print(util_strings(jobs)) + yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) + print( + "Experiment summary YAML in ", + f"./{str(yml_out.relative_to(Path.cwd()))}", + ) else: print("Aborting") From 97018bb8f502a6d3b5a769edb2ca406baad2054e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 16:43:40 -0500 Subject: [PATCH 059/273] working orion implementation --- configs/exps/debug/orion.yaml | 37 ++++++++++++++++------------ launch_exp.py | 37 +++++++++++++++++++--------- main.py | 33 +++++++------------------ ocpmodels/common/flags.py | 2 +- ocpmodels/common/utils.py | 34 +++++++++++++++++++++++++ ocpmodels/trainers/base_trainer.py | 6 ++--- ocpmodels/trainers/single_trainer.py | 6 ++++- 7 files changed, 98 insertions(+), 57 deletions(-) diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml index 4a02537aa9..93eb5f5de4 100644 --- a/configs/exps/debug/orion.yaml +++ b/configs/exps/debug/orion.yaml @@ -1,8 +1,8 @@ # more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij job: - mem: 32GB + mem: 24GB cpus: 4 - gres: gpu:1 + gres: gpu:16gb:1 time: 1:00:00 partition: main code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab @@ -32,7 +32,6 @@ default: decay_rate: 0.05 # at the end of training, lr is decay_rate*lr_initial # max_epochs = ref_steps[3e6] / (n_train[110 000] / ref_batch_size[32]) max_epochs: -1 - max_steps: 3000000 note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels optim: batch_size, lr_initial @@ -40,16 +39,22 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - _meta_: - n_runs: 2 - unique_exp_name: ocp-qm9-orion-debug-v0.0.2 - optim: - batch_size: uniform(32, 1024, discrete=True) - lr_initial: loguniform(1e-5, 5e-3, precision=2) - max_steps: fidelity(1e4, 1e6, base=5e5) - model: - num_gaussians: uniform(16, 200, discrete=True) - hidden_channels: uniform(32, 512, discrete=True) - num_filters: uniform(32, 512, discrete=True) - num_interactions: uniform(1, 7, discrete=True) - phys_embeds: choices([True, False]) \ No newline at end of file + n_jobs: 20 + + unique_exp_name: ocp-qm9-orion-debug-v1.0.0 + + space: + optim/max_steps: fidelity(1e5, 1e6, base=3) + optim/batch_size: uniform(32, 128, discrete=True) + optim/lr_initial: loguniform(1e-5, 5e-3, precision=2) + model/num_gaussians: uniform(16, 200, discrete=True) + model/hidden_channels: uniform(32, 512, discrete=True) + model/num_filters: uniform(32, 512, discrete=True) + model/num_interactions: uniform(1, 7, discrete=True) + model/phys_embeds: choices([True, False]) + + algorithms: + asha: + seed: 123 + num_rungs: 5 + num_brackets: 1 diff --git a/launch_exp.py b/launch_exp.py index c7109bfdef..e40199df7a 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -142,6 +142,22 @@ def cli_arg(args, key=""): return s +def get_args_or_exp(key, args, exp): + value = None + if key in args: + if key in exp: + print(f"Overriding orion.{key} from the command-line") + value = args[key] + elif key in exp: + value = exp[key] + else: + raise ValueError( + f"Must specify 'orion.{key}' " + + f"in exp file or from the command-line `{key}=value`" + ) + return value + + if __name__ == "__main__": is_interrupted = False args = resolved_args() @@ 
-157,24 +173,21 @@ def cli_arg(args, key=""): if "orion" in exp: orion_base = ROOT / "data" / "orion" assert "runs" not in exp, "Cannot use both Orion and runs" - meta = exp["orion"].pop("_meta_", {}) - assert ( - "unique_exp_name" in meta - ), "Must specify 'orion._meta_.unique_exp_name' in exp file" - assert "n_runs" in meta, "Must specify 'orion._meta_.n_runs' in exp file" - - search_path = ( - orion_base / "search-spaces" / f"{ts}-{meta['unique_exp_name']}.yaml" - ) + + n_jobs = get_args_or_exp("n_jobs", args, exp["orion"]) + unique_exp_name = get_args_or_exp("unique_exp_name", args, exp["orion"]) + if "unique_exp_name" not in exp: + exp["unique_exp_name"] = unique_exp_name + + search_path = orion_base / "search-spaces" / f"{ts}-{unique_exp_name}.yaml" search_path.parent.mkdir(exist_ok=True, parents=True) assert not search_path.exists() search_path.write_text(dump(exp["orion"])) runs = [ { - "orion_search_path": str(search_path), - "orion_unique_exp_name": meta["unique_exp_name"], + "orion_exp_config_path": str(search_path), } - for _ in range(meta["n_runs"]) + for _ in range(n_jobs) ] else: runs = exp["runs"] diff --git a/main.py b/main.py index 9570e809c2..5d2558beb1 100644 --- a/main.py +++ b/main.py @@ -11,27 +11,26 @@ import time import traceback import warnings -from pathlib import Path import torch -from orion.client import build_experiment -from yaml import safe_load, dump +from yaml import dump from ocpmodels.common import distutils from ocpmodels.common.flags import flags from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, - ROOT, build_config, continue_from_slurm_job_id, continue_orion_exp, + load_orion_exp, merge_dicts, move_lmdb_data_to_slurm_tmpdir, read_slurm_env, resolve, setup_imports, setup_logging, + unflatten_dict, update_from_sbatch_py_vars, ) from ocpmodels.trainers import BaseTrainer @@ -76,7 +75,7 @@ def run(self, orion_exp=None): if distutils.is_master(): if orion_exp: orion_trial = orion_exp.suggest(1) - self.hparams = orion_trial.params + self.hparams = unflatten_dict(orion_trial.params, sep="/") self.hparams["orion_hash_params"] = orion_trial.hash_params should_be_0 = distutils.get_rank() @@ -87,8 +86,8 @@ def run(self, orion_exp=None): # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 if self.hparams: - print("\n💎💎Received hyper-parameters from Orion:") - print(dump(self.hparams), end="\n💎💎\n") + print("\n💎 Received hyper-parameters from Orion:") + print(dump(self.hparams), end="\n") self.trainer_config = merge_dicts(self.trainer_config, self.hparams) self.trainer_config = continue_orion_exp(self.trainer_config) @@ -169,23 +168,9 @@ def run(self, orion_exp=None): # ------------------- # ----- Train ----- # ------------------- - if args.orion_search_path and distutils.is_master(): - assert args.orion_unique_exp_name - space = safe_load(Path(args.orion_search_path).read_text()) - print("Search Space: ", space) - experiment = build_experiment( - storage={ - "database": { - "host": str( - ROOT / "data" / "orion" / "storage" / "orion_db.pkl" - ), - "type": "pickleddb", - } - }, - name=args.orion_unique_exp_name, - space=space, - algorithms={"asha": {"seed": 123}}, - ) + if args.orion_exp_config_path and distutils.is_master(): + experiment = load_orion_exp(args) + print("\nStarting runner.") runner.run(orion_exp=experiment) else: print("Starting runner.") diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index e1d19fbb5f..30024baa19 100644 --- a/ocpmodels/common/flags.py +++ 
b/ocpmodels/common/flags.py @@ -240,7 +240,7 @@ def add_core_args(self): help="Log training loss every n steps", ) self.parser.add_argument( - "--orion_search_path", + "--orion_exp_config_path", "-o", type=str, help="Path to an orion search space yaml file", diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 95fca5b602..0baaeec890 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -30,6 +30,7 @@ import yaml from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas from matplotlib.figure import Figure +from orion.client import build_experiment from torch_geometric.data import Data from torch_geometric.utils import remove_self_loops from torch_scatter import segment_coo, segment_csr @@ -43,6 +44,28 @@ JOB_ID = os.environ.get("SLURM_JOB_ID") +def load_orion_exp(args): + exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) + + assert args.orion_unique_exp_name or exp_config.get( + "unique_exp_name" + ), "Must provide orion_unique_exp_name in the command-line or the config file." + + print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") + experiment = build_experiment( + storage={ + "database": { + "host": str(ROOT / "data" / "orion" / "storage" / "orion_db.pkl"), + "type": "pickleddb", + } + }, + name=args.orion_unique_exp_name or exp_config["unique_exp_name"], + space=exp_config["space"], + algorithms=exp_config["algorithms"], + ) + return experiment + + def continue_orion_exp(trainer_config): if not trainer_config.get("orion_search_path") or not trainer_config.get( "orion_unique_exp_name" @@ -793,6 +816,17 @@ def create_dict_from_args(args: list, sep: str = "."): return return_dict +def unflatten_dict(source, sep="."): + """ + >>> d = {"a.b": 4, "a.c": 5, "r.y": 1} + >>> unflatten_dict(d) + {'a': {'b': 4, 'c': 5}, 'r': {'y': 1}} + """ + target = {} + [dict_set_recursively(target, k.split(sep), v) for k, v in source.items()] + return target + + def load_config_legacy(path: str, previous_includes: list = []): path = Path(path) if path in previous_includes: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e410d3d885..b5a7b71001 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -286,9 +286,9 @@ def load_task(self): def load_model(self): # Build model - if distutils.is_master() and not self.silent: - logging.info( - f"Loading model {self.config['model_name']}:" + if not self.silent: + print( + f"🧠 Loading model {self.config['model_name']}:\n" + f" {yaml.dump(self.config['model'])}" ) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index adf460f72f..d76133fee9 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -48,7 +48,7 @@ def now(self): def load_task(self): if not self.silent: - logging.info(f"Loading dataset: {self.config['task']['dataset']}") + print(f"Loading dataset: {self.config['task']['dataset']}") self.num_targets = 1 # start imports from @@ -80,6 +80,10 @@ def load_task(self): device=self.device, ) else: + print( + "Warning: grad_target_mean not found in normalizer but", + "regress_forces and normalize_labels are true.", + ) self.normalizers["grad_target"] = Normalizer( tensor=self.datasets["train"].data.y[ self.datasets["train"].__indices__ From a0d5aa7312f29d0a0f1efa42caa74745e0542a40 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Wed, 11 Jan 2023 17:51:39 -0500 Subject: [PATCH 060/273] update config and fanet modif --- 
configs/exps/gnn/edge_embed_type.yaml | 29 +--- configs/exps/gnn/edge_embed_type_s2ef.yaml | 65 +++++++++ configs/exps/gnn/mp_type_2.yaml | 133 ++++++++++++++++++ .../exps/prop-check/symmetries_s2ef_2.yaml | 45 +----- .../exps/prop-check/symmetries_s2ef_3.yaml | 92 ++++++++++++ configs/models/fanet.yaml | 19 +-- configs/models/sfarinet.yaml | 17 ++- ocpmodels/models/fanet.py | 34 +++-- scripts/gnn_dev.py | 6 +- 9 files changed, 334 insertions(+), 106 deletions(-) create mode 100644 configs/exps/gnn/edge_embed_type_s2ef.yaml create mode 100644 configs/exps/gnn/mp_type_2.yaml create mode 100644 configs/exps/prop-check/symmetries_s2ef_3.yaml diff --git a/configs/exps/gnn/edge_embed_type.yaml b/configs/exps/gnn/edge_embed_type.yaml index e64fd7bc3f..b7a0418fe2 100644 --- a/configs/exps/gnn/edge_embed_type.yaml +++ b/configs/exps/gnn/edge_embed_type.yaml @@ -16,33 +16,16 @@ default: energy_head: 'weighted-av-initial-embeds' # False ? wandb_tags: 'edge-embed-test' optim: - max_epochs: 5 + max_epochs: 15 batch_size: 256 eval_batch_size: 256 + cp_data_to_tmpdir: true runs: - - config: sfarinet-is2re-all - note: 'Sfarinet no sym' - - config: sfarinet-is2re-all - note: 'Sfarinet baseline sym' - frame_averaging: 2D - fa_frames: se3-random - config: sfarinet-is2re-all note: 'Sfarinet baseline sym' frame_averaging: 2D fa_frames: se3-random - - config: sfarinet-is2re-all - note: 'rij' - frame_averaging: 2D - fa_frames: se3-random - model: - edge_embed_type: rij - - config: sfarinet-is2re-all - note: 'sh' - frame_averaging: 2D - fa_frames: se3-random - model: - edge_embed_type: sh - config: sfarinet-is2re-all note: 'all rij' frame_averaging: 2D @@ -55,11 +38,3 @@ runs: fa_frames: se3-random model: edge_embed_type: all - - config: sfarinet-is2re-all - note: 'all' - frame_averaging: 2D - fa_frames: se3-random - model: - edge_embed_type: all - optim: - lr_initial: 0.0007 \ No newline at end of file diff --git a/configs/exps/gnn/edge_embed_type_s2ef.yaml b/configs/exps/gnn/edge_embed_type_s2ef.yaml new file mode 100644 index 0000000000..5ad120c07d --- /dev/null +++ b/configs/exps/gnn/edge_embed_type_s2ef.yaml @@ -0,0 +1,65 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 30:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + cp_data_to_tmpdir: true + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? 
+ regress_forces: direct_with_gradient_target + wandb_tags: 's2ef-archi-tests' + optim: + max_epochs: 5 + batch_size: 192 + eval_batch_size: 192 + +runs: + - config: sfarinet-s2ef-2M + note: 'Sfarinet no sym' + - config: sfarinet-s2ef-2M + note: 'Sfarinet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: sfarinet-s2ef-2M + note: 'Sfarinet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: sfarinet-s2ef-2M + note: 'rij' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: rij + - config: sfarinet-s2ef-2M + note: 'sh' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: sh + - config: sfarinet-s2ef-2M + note: 'all rij' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all_rij + - config: sfarinet-s2ef-2M + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + - config: sfarinet-s2ef-2M + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all diff --git a/configs/exps/gnn/mp_type_2.yaml b/configs/exps/gnn/mp_type_2.yaml new file mode 100644 index 0000000000..c91945a03d --- /dev/null +++ b/configs/exps/gnn/mp_type_2.yaml @@ -0,0 +1,133 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'fanet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: fanet-is2re-all + note: 'simple' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + - config: fanet-is2re-all + note: 'att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: att + - config: fanet-is2re-all + note: 'local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + - config: fanet-is2re-all + note: 'up_down_local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: up_down_local_env + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + - config: fanet-is2re-all + note: 'base_with_att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_with_att + - config: fanet-is2re-all + note: 'updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 128 + eval_batch_size: 128 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 64 + eval_batch_size: 64 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 300 + eval_batch_size: 300 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + warmup_factor: 0.05 + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + 
optim: + warmup_steps: 4000 + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + optim: + lr_gamma: 0.4 + max_epochs: 20 + - config: sfarinet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml index aebe1b7934..9f6ae35be2 100644 --- a/configs/exps/prop-check/symmetries_s2ef_2.yaml +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -33,56 +33,15 @@ runs: fa_frames: all model: regress_forces: from_energy - - - config: sfarinet-s2ef-2M + - config: sfarinet-s2ef-2M # 2659788 note: '2D all gradient' frame_averaging: 2D fa_frames: all model: regress_forces: direct_with_gradient_target - - config: sfarinet-s2ef-2M + - config: sfarinet-s2ef-2M # 2659789 note: '2d all no gradient' frame_averaging: 2D fa_frames: all model: regress_forces: direct - - - config: sfarinet-s2ef-2M - note: 'Big energy grad coef' - frame_averaging: 2D - fa_frames: all - model: - regress_forces: direct_with_gradient_target - optim: - energy_grad_coefficient: 100 - force_coefficient: 30 - energy_coefficient: 1 - - config: sfarinet-s2ef-2M - note: 'Big energy grad coef' - frame_averaging: 2D - fa_frames: random - model: - regress_forces: direct_with_gradient_target - optim: - energy_grad_coefficient: 100 - force_coefficient: 30 - energy_coefficient: 1 - - config: sfarinet-s2ef-2M - note: 'No energy coef' - frame_averaging: 2D - fa_frames: random - model: - regress_forces: direct_with_gradient_target - optim: - energy_grad_coefficient: 100 - force_coefficient: 30 - energy_coefficient: 0 - - config: sfarinet-s2ef-2M - note: 'Large force coef' - frame_averaging: 2D - fa_frames: random - model: - regress_forces: direct_with_gradient_target - optim: - force_coefficient: 75 - energy_coefficient: 1 \ No newline at end of file diff --git a/configs/exps/prop-check/symmetries_s2ef_3.yaml b/configs/exps/prop-check/symmetries_s2ef_3.yaml new file mode 100644 index 0000000000..2605009336 --- /dev/null +++ b/configs/exps/prop-check/symmetries_s2ef_3.yaml @@ -0,0 +1,92 @@ +job: + mem: 48GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: False # False ? 
+ optim: + max_epochs: 5 + batch_size: 196 + eval_batch_size: 196 + wandb_tags: 's2ef-sym-prop' + cp_data_to_tmp_dir: True + +runs: + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs 1 Gpu' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: from_energy + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs 1 Gpu' + frame_averaging: 3D + fa_frames: all + model: + regress_forces: from_energy + + + + + - config: sfarinet-s2ef-2M + note: '2D all gradient' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + - config: sfarinet-s2ef-2M + note: '2d all no gradient' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct + + - config: sfarinet-s2ef-2M + note: 'Big energy grad coef' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 1 + - config: sfarinet-s2ef-2M + note: 'Big energy grad coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 1 + - config: sfarinet-s2ef-2M + note: 'No energy coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 0 + - config: sfarinet-s2ef-2M + note: 'Large force coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + force_coefficient: 75 + energy_coefficient: 1 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index b04d7dfba7..7426140a2b 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -73,19 +73,21 @@ is2re: hidden_channels: 384 num_interactions: 4 optim: + batch_size: 256 + eval_batch_size: 256 lr_initial: 0.001 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - 18000 - 27000 - 37000 - warmup_steps: 5394 + warmup_steps: 6000 max_epochs: 20 # ------------------ # ----- S2EF ----- # ------------------ -# For 4 GPUs +# For 2 GPUs s2ef: default: @@ -98,14 +100,13 @@ s2ef: force_coefficient: 30 energy_grad_coefficient: 10 optim: - batch_size: 48 - eval_batch_size: 48 - warmup_steps: 25000 + batch_size: 96 + eval_batch_size: 96 warmup_factor: 0.2 lr_gamma: 0.1 lr_initial: 0.0001 max_epochs: 15 - warmup_steps: 20000 + warmup_steps: 30000 lr_milestones: - 55000 - 75000 @@ -113,7 +114,7 @@ s2ef: 200k: {} - # 2 gpus + # 1 gpus 2M: model: num_interactions: 5 @@ -121,8 +122,8 @@ s2ef: num_gaussians: 200 num_filters: 256 optim: - batch_size: 96 - eval_batch_size: 96 + batch_size: 192 + eval_batch_size: 192 20M: {} diff --git a/configs/models/sfarinet.yaml b/configs/models/sfarinet.yaml index 0e2d82d993..f078c948df 100644 --- a/configs/models/sfarinet.yaml +++ b/configs/models/sfarinet.yaml @@ -83,14 +83,14 @@ is2re: - 17981 - 26972 - 35963 - warmup_steps: 5394 + warmup_steps: 6000 max_epochs: 20 # ------------------ # ----- S2EF ----- # ------------------ -# For 4 GPUs +# For 1 GPUs s2ef: default: @@ -103,18 +103,17 @@ s2ef: force_coefficient: 30 energy_grad_coefficient: 10 optim: - batch_size: 48 - eval_batch_size: 48 - warmup_steps: 25000 + batch_size: 192 + eval_batch_size: 192 + warmup_steps: 30000 warmup_factor: 0.2 lr_gamma: 0.1 lr_initial: 0.0002 max_epochs: 20 - warmup_steps: 20000 lr_milestones: - - 50000 - - 70000 - - 90000 + - 55000 + - 80000 + - 105000 200k: {} diff --git 
a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 6d893b2a26..ce059fef70 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -114,15 +114,14 @@ def __init__( if self.edge_embed_type == "rij": self.lin_e1 = Linear(3, num_filters) elif self.edge_embed_type == "all_rij": - self.lin_e1 = Linear(3, num_filters // 3) # r_ij - self.lin_e12 = Linear(3, num_filters // 3) # norm r_ij - self.lin_e13 = Linear( - num_gaussians, num_filters - 2 * (num_filters // 3) + self.lin_e1 = Linear(3, num_filters // 2) # r_ij + self.lin_e12 = Linear( + num_gaussians, num_filters - (num_filters // 2) ) # d_ij elif self.edge_embed_type == "sh": self.lin_e1 = Linear(15, num_filters) elif self.edge_embed_type == "all": - self.lin_e1 = Linear(18, num_filters) + self.lin_e1 = Linear(15, num_filters) else: raise ValueError("edge_embedding_type does not exist") @@ -152,8 +151,6 @@ def reset_parameters(self): if self.edge_embed_type == "all_rij": nn.init.xavier_uniform_(self.lin_e12.weight) self.lin_e12.bias.data.fill_(0) - nn.init.xavier_uniform_(self.lin_e13.weight) - self.lin_e13.bias.data.fill_(0) def forward( self, z, rel_pos, edge_attr, tag=None, normalised_rel_pos=None, subnodes=None @@ -165,9 +162,8 @@ def forward( e = self.lin_e1(rel_pos) elif self.edge_embed_type == "all_rij": rel_pos = self.lin_e1(rel_pos) # r_ij - normalized_rel_pos = self.lin_e12(normalised_rel_pos) # norm r_ij - edge_attr = self.lin_e13(edge_attr) # d_ij - e = torch.cat((rel_pos, edge_attr, normalized_rel_pos), dim=1) + edge_attr = self.lin_e12(edge_attr) # d_ij + e = torch.cat((rel_pos, edge_attr), dim=1) elif self.edge_embed_type == "sh": self.sh = spherical_harmonics( l=[1, 2, 3], @@ -187,8 +183,8 @@ def forward( e = self.lin_e1(e) if self.second_layer_MLP: - e = self.lin_e2(e) - # e = self.lin_e2(self.act(e)) + # e = self.lin_e2(e) + e = self.lin_e2(self.act(e)) # --- Node embedding -- @@ -254,7 +250,14 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): elif self.mp_type == "base_with_att": # --- Compute attention coefficients if required -- self.lin_h = nn.Linear(hidden_channels, hidden_channels) - self.lin_geom = AttConv(hidden_channels, heads=1, concat=True, bias=True) + # self.lin_geom = AttConv(hidden_channels, heads=1, concat=True, bias=True) + self.lin_geom = TransformerConv( + hidden_channels, + hidden_channels, + heads=1, + concat=True, + root_weight=False, + ) elif self.mp_type == "att": # --- Compute attention coefficients if required -- @@ -296,7 +299,7 @@ def reset_parameters(self): def forward(self, h, edge_index, e): - if self.mp_type in {"base"}: + if self.mp_type == "base": e = torch.cat([e, h[edge_index[0]], h[edge_index[1]]], dim=1) # W = self.lin_e_2(self.act(self.lin_e_1(e))) # transform edge rep @@ -316,6 +319,7 @@ def forward(self, h, edge_index, e): elif self.mp_type == "base_with_att": h = self.lin_h(self.act(h)) h = self.lin_geom(h, edge_index, W) # propagate is inside + elif self.mp_type == "local_env": h = self.lin_h(self.act(h)) chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate @@ -329,7 +333,7 @@ def forward(self, h, edge_index, e): h = torch.cat((h, chi), dim=1) h = self.lin_geom(h) - else: # base, simple + else: # base, simple, sfarinet h = self.lin_h(self.act(h)) h = self.propagate(edge_index, x=h, W=W) # propagate diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 40dc41a18f..2797921988 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -21,10 +21,10 @@ config["frame_averaging"] = "2D" config["fa_frames"] = "random" # 
"random" config["test_ri"] = True - config["optim"] = {"max_epochs": 0} + config["optim"] = {"max_epochs": 1} config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "rij" - # config["model"]["mp_type"] = "base" + config["model"]["mp_type"] = "att" checkpoint_path = None # "checkpoints/2022-04-28-11-42-56-dimenetplusplus/" + "best_checkpoint.pt" @@ -32,7 +32,7 @@ str_args = sys.argv[1:] if all("config" not in arg for arg in str_args): str_args.append("--is_debug") - str_args.append("--config=sfarinet-is2re-10k") + str_args.append("--config=fanet-is2re-10k") # str_args.append("--config=sfarinet-s2ef-2M") warnings.warn( "No model / mode is given; chosen as default" + f"Using: {str_args[-1]}" From 35d1f9a4f9c34435bfa3ea08aae8b33083927131 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Wed, 11 Jan 2023 17:59:29 -0500 Subject: [PATCH 061/273] fix sfarinet --- ocpmodels/models/fanet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index ce059fef70..6f262c1563 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -460,6 +460,8 @@ def __init__(self, **kwargs): self.max_num_neighbors = kwargs["max_num_neighbors"] self.edge_embed_type = kwargs["edge_embed_type"] self.skip_co = kwargs["skip_co"] + if kwargs["mp_type"] == 'sfarinet': + kwargs["num_filters"] = kwargs["hidden_channels"] self.act = ( getattr(nn.functional, kwargs["act"]) if kwargs["act"] != "swish" else swish From c0deb8fa94825bf041072d18f2dbbef3c19ae29a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 18:10:26 -0500 Subject: [PATCH 062/273] store `orion_unique_exp_name` --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 5d2558beb1..7c282f303c 100644 --- a/main.py +++ b/main.py @@ -77,6 +77,7 @@ def run(self, orion_exp=None): orion_trial = orion_exp.suggest(1) self.hparams = unflatten_dict(orion_trial.params, sep="/") self.hparams["orion_hash_params"] = orion_trial.hash_params + self.hparams["orion_unique_exp_name"] = orion_exp.name should_be_0 = distutils.get_rank() hp_list = [self.hparams, should_be_0] From 3d868ec65700d4e3a655abb4bf0669a492d60154 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 18:13:31 -0500 Subject: [PATCH 063/273] `auto_note` after orion sampling --- main.py | 2 ++ ocpmodels/common/utils.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 7c282f303c..73d75f5e15 100644 --- a/main.py +++ b/main.py @@ -20,6 +20,7 @@ from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, + auto_note, build_config, continue_from_slurm_job_id, continue_orion_exp, @@ -92,6 +93,7 @@ def run(self, orion_exp=None): self.trainer_config = merge_dicts(self.trainer_config, self.hparams) self.trainer_config = continue_orion_exp(self.trainer_config) + self.trainer_config = auto_note(self.trainer_config) cls = registry.get_trainer_class(self.trainer_config["trainer"]) self.trainer: BaseTrainer = cls(**self.trainer_config) task = registry.get_task_class(self.trainer_config["mode"])(self.trainer_config) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 0baaeec890..3fd421ea68 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -941,7 +941,6 @@ def build_config(args, args_override): config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) config = override_narval_paths(config) - config = auto_note(config) if not 
config["no_cpus_to_workers"]: cpus = count_cpus() From 49415929137514b47a134e86c27bc75748241c53 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 18:40:56 -0500 Subject: [PATCH 064/273] fix `forces_grad_target` propagation --- ocpmodels/trainers/single_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index d76133fee9..5c26c974fc 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -430,12 +430,13 @@ def model_forward(self, batch_list): .view(-1, 3) ) f_all.append(g_forces) + breakpoint() batch_list[0].pos = original_pos if self.task_name in OCP_TASKS: batch_list[0].cell = original_cell # Average predictions over frames - preds = {"energy": sum(e_all) / len(e_all)} + preds["energy"] = sum(e_all) / len(e_all) if len(p_all) > 0 and all(y is not None for y in p_all): preds["pooling_loss"] = sum(p_all) / len(p_all) if len(f_all) > 0 and all(y is not None for y in f_all): From 84b7b7a3f66cb535a96551ff57f11ec0ce6c8f07 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 18:55:35 -0500 Subject: [PATCH 065/273] trailing breakpoinut --- ocpmodels/trainers/single_trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 5c26c974fc..0c7109bc25 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -430,7 +430,6 @@ def model_forward(self, batch_list): .view(-1, 3) ) f_all.append(g_forces) - breakpoint() batch_list[0].pos = original_pos if self.task_name in OCP_TASKS: batch_list[0].cell = original_cell From 788bf52f3e65c1be1e8026cbda4e661ab8b48886 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 20:05:33 -0500 Subject: [PATCH 066/273] fix `continue_orion_exp` --- ocpmodels/common/utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 3fd421ea68..7945a15121 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -67,16 +67,14 @@ def load_orion_exp(args): def continue_orion_exp(trainer_config): - if not trainer_config.get("orion_search_path") or not trainer_config.get( - "orion_unique_exp_name" - ): + if not trainer_config.get("orion_exp_config_path"): return trainer_config if "orion_hash_params" not in trainer_config: faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" print( - "\n\nWARNING: trainer_config has 'orion_search_path' and", - "'orion_unique_exp_name' but no 'orion_hash_params'.", + "\n\nWARNING: trainer_config has 'orion_exp_config_path'", + "but no 'orion_hash_params'.", "This can lead to inconsistencies.", f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", ) From 0fb4f50288311a8294cb87135884e57732195f90 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 12 Jan 2023 10:26:11 -0500 Subject: [PATCH 067/273] validate GNN MP type extensions --- configs/exps/gnn/mp_type_2.yaml | 96 ++++++------ configs/exps/gnn/test_activ_linear.yaml | 128 +++++++++++++++ ocpmodels/models/fanet.py | 95 ++++++----- ocpmodels/models/utils/attention_model.py | 183 +++++++++++++++++++++- 4 files changed, 414 insertions(+), 88 deletions(-) create mode 100644 configs/exps/gnn/test_activ_linear.yaml diff --git a/configs/exps/gnn/mp_type_2.yaml b/configs/exps/gnn/mp_type_2.yaml index c91945a03d..f19ceceaf1 100644 --- 
a/configs/exps/gnn/mp_type_2.yaml +++ b/configs/exps/gnn/mp_type_2.yaml @@ -24,57 +24,47 @@ default: runs: - config: fanet-is2re-all - note: 'fanet baseline sym' + note: 'simple' frame_averaging: 2D fa_frames: se3-random + model: + second_layer_mlp: true + mp_type: simple - config: fanet-is2re-all note: 'simple' frame_averaging: 2D fa_frames: se3-random model: + skip_co: true mp_type: simple + + - config: fanet-is2re-all note: 'sfarinet' frame_averaging: 2D fa_frames: se3-random model: mp_type: sfarinet + optim: + max_epochs: 20 - config: fanet-is2re-all - note: 'att' - frame_averaging: 2D - fa_frames: se3-random - model: - mp_type: att - - config: fanet-is2re-all - note: 'local_env' - frame_averaging: 2D - fa_frames: se3-random - model: - mp_type: local_env - - config: fanet-is2re-all - note: 'up_down_local_env' + note: 'sfarinet' frame_averaging: 2D fa_frames: se3-random model: - mp_type: up_down_local_env + mp_type: sfarinet + optim: + max_epochs: 20 - config: fanet-is2re-all note: 'base' frame_averaging: 2D fa_frames: se3-random model: mp_type: base - - config: fanet-is2re-all - note: 'base_with_att' - frame_averaging: 2D - fa_frames: se3-random - model: - mp_type: base_with_att - - config: fanet-is2re-all - note: 'updownscale' - frame_averaging: 2D - fa_frames: se3-random - model: - mp_type: updownscale + optim: + warmup_factor: 0.5 + lr_initial: 0.003 + max_epochs: 10 - config: fanet-is2re-all note: 'base' frame_averaging: 2D @@ -82,8 +72,9 @@ runs: model: mp_type: base optim: - batch_size: 128 - eval_batch_size: 128 + warmup_factor: 0.1 + lr_initial: 0.0005 + max_epochs: 10 - config: fanet-is2re-all note: 'base' frame_averaging: 2D @@ -93,41 +84,52 @@ runs: optim: batch_size: 64 eval_batch_size: 64 + max_epochs: 10 - config: fanet-is2re-all - note: 'base' + note: 'att' frame_averaging: 2D fa_frames: se3-random model: - mp_type: base + mp_type: att optim: - batch_size: 300 - eval_batch_size: 300 + lr_initial: 0.0005 + + - config: fanet-is2re-all - note: 'base' + note: 'local_env' frame_averaging: 2D fa_frames: se3-random model: - mp_type: base - optim: - warmup_factor: 0.05 + mp_type: local_env - config: fanet-is2re-all - note: 'sfarinet' + note: 'up_down_local_env' frame_averaging: 2D fa_frames: se3-random model: - mp_type: sfarinet - optim: - warmup_steps: 4000 + mp_type: up_down_local_env - config: fanet-is2re-all - note: 'sfarinet' + note: 'up_down_local_env 2 layer' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: up_down_local_env + second_layer_mlp: true + - config: fanet-is2re-all + note: 'sfarinet 2 layer' frame_averaging: 2D fa_frames: se3-random model: mp_type: sfarinet - optim: - lr_gamma: 0.4 - max_epochs: 20 - - config: sfarinet-is2re-all - note: 'sfarinet' + second_layer_mlp: true + - config: fanet-is2re-all + note: 'base_with_att' frame_averaging: 2D fa_frames: se3-random + model: + mp_type: base_with_att + - config: fanet-is2re-all + note: 'updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale diff --git a/configs/exps/gnn/test_activ_linear.yaml b/configs/exps/gnn/test_activ_linear.yaml new file mode 100644 index 0000000000..dbf890165a --- /dev/null +++ b/configs/exps/gnn/test_activ_linear.yaml @@ -0,0 +1,128 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # 
False ? + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'sfarinet reverted linear activ' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + - config: sfarinet-is2re-all + note: 'base sfarinet' + frame_averaging: 2D + fa_frames: se3-random + + - config: fanet-is2re-all + note: 'att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: att + - config: fanet-is2re-all + note: 'local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + - config: fanet-is2re-all + note: 'up_down_local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: up_down_local_env + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + - config: fanet-is2re-all + note: 'base_with_att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_with_att + - config: fanet-is2re-all + note: 'updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 128 + eval_batch_size: 128 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 64 + eval_batch_size: 64 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 300 + eval_batch_size: 300 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + warmup_factor: 0.05 + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + optim: + warmup_steps: 4000 + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + optim: + lr_gamma: 0.4 + max_epochs: 20 + - config: sfarinet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 6f262c1563..5390027b97 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -11,7 +11,7 @@ from ocpmodels.common.utils import conditional_grad, get_pbc_distances from ocpmodels.models.base_model import BaseModel from ocpmodels.models.force_decoder import ForceDecoder -from ocpmodels.models.utils.attention_model import AttConv +from ocpmodels.models.utils.attention_model import TransfoAttConv from ocpmodels.models.utils.pos_encodings import PositionalEncoding from ocpmodels.modules.phys_embeddings import PhysEmbedding from ocpmodels.modules.pooling import Graclus, Hierarchical_Pooling @@ -182,9 +182,11 @@ def forward( e = torch.cat((rel_pos, self.sh), dim=1) e = self.lin_e1(e) + e = self.act(e) # can comment out + if self.second_layer_MLP: # e = self.lin_e2(e) - e = self.lin_e2(self.act(e)) + e = self.act(self.lin_e2(e)) # --- Node embedding -- @@ -220,9 +222,9 @@ def forward( h += h_pos # MLP - h = self.lin(h) + h = self.act(self.lin(h)) if self.second_layer_MLP: - h = self.lin_2(self.act(h)) + h = self.act(self.lin_2(h)) return h, e @@ -242,25 +244,27 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_h = nn.Linear(hidden_channels, hidden_channels) elif self.mp_type == "updownscale": - # self.lin_geom = 
nn.Linear(num_filters + 2 * hidden_channels, num_filters) self.lin_geom = nn.Linear(num_filters, num_filters) # like 'simple' self.lin_down = nn.Linear(hidden_channels, num_filters) self.lin_up = nn.Linear(num_filters, hidden_channels) + elif self.mp_type == "updownscale_base": + self.lin_geom = nn.Linear(num_filters + 2 * hidden_channels, num_filters) + self.lin_down = nn.Linear(hidden_channels, num_filters) + self.lin_up = nn.Linear(num_filters, hidden_channels) + elif self.mp_type == "base_with_att": - # --- Compute attention coefficients if required -- self.lin_h = nn.Linear(hidden_channels, hidden_channels) # self.lin_geom = AttConv(hidden_channels, heads=1, concat=True, bias=True) - self.lin_geom = TransformerConv( + self.lin_geom = TransfoAttConv( hidden_channels, hidden_channels, heads=1, concat=True, root_weight=False, + edge_dim=num_filters, ) - elif self.mp_type == "att": - # --- Compute attention coefficients if required -- self.lin_h = nn.Linear(hidden_channels, hidden_channels) self.lin_geom = TransformerConv( hidden_channels, @@ -275,9 +279,10 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_geom = nn.Linear(num_filters, hidden_channels) self.lin_h = nn.Linear(hidden_channels, hidden_channels) - elif self.mp_type == "up_down_local_env": - self.lin_h = nn.Linear(hidden_channels, num_filters) - self.lin_geom = nn.Linear(2 * num_filters, hidden_channels) + elif self.mp_type == "updown_local_env": + self.lin_down = nn.Linear(hidden_channels, num_filters) + self.lin_geom = nn.Linear(num_filters, num_filters) + self.lin_up = nn.Linear(2 * num_filters, hidden_channels) else: # base self.lin_geom = nn.Linear( @@ -299,43 +304,54 @@ def reset_parameters(self): def forward(self, h, edge_index, e): - if self.mp_type == "base": + # Define edge embedding + if self.mp_type in {"base", "updownscale_base"}: e = torch.cat([e, h[edge_index[0]], h[edge_index[1]]], dim=1) - # W = self.lin_e_2(self.act(self.lin_e_1(e))) # transform edge rep - if self.mp_type in {"up_down_local_env", "sfarinet", "base_with_att", "att"}: - W = e - else: - W = self.lin_geom(e) + if self.mp_type in { + "simple", + "updownscale", + "base", + "updownscale_base", + "local_env", + }: + e = self.act(self.lin_geom(e)) # TODO: remove act() ? - if self.mp_type == "updownscale": - h = self.lin_down(h) # downscale node rep. - h = self.propagate(edge_index, x=h, W=W) # propagate - h = self.lin_up(self.act(h)) # upscale node rep. + # --- Message Passing block -- + + if self.mp_type == "updownscale" or self.mp_type == "updownscale_base": + h = self.act(self.lin_down(h)) # downscale node rep. + h = self.propagate(edge_index, x=h, W=e) # propagate + h = self.act(self.lin_up(h)) # upscale node rep. 
elif self.mp_type == "att": - h = self.lin_h(self.act(h)) - h = self.lin_geom(h, edge_index, edge_attr=W) + h = self.lin_geom(h, edge_index, edge_attr=e) + h = self.act(self.lin_h(h)) + elif self.mp_type == "base_with_att": - h = self.lin_h(self.act(h)) - h = self.lin_geom(h, edge_index, W) # propagate is inside + h = self.lin_geom(h, edge_index, edge_attr=e) # propagate is inside + h = self.act(self.lin_h(h)) elif self.mp_type == "local_env": - h = self.lin_h(self.act(h)) - chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate - h = self.propagate(edge_index, x=h, W=W) # propagate + chi = self.propagate(edge_index, x=h, W=e, local_env=True) + h = self.propagate(edge_index, x=h, W=e) # propagate h = h + chi - # h = h * chi - elif self.mp_type == "up_down_local_env": - h = self.lin_h(self.act(h)) - chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate - h = self.propagate(edge_index, x=h, W=W) # propagate + h = h = self.act(self.lin_h(h)) + + elif self.mp_type == "updown_local_env": + h = self.act(self.lin_down(h)) + chi = self.propagate(edge_index, x=h, W=e, local_env=True) + e = self.lin_geom(e) + h = self.propagate(edge_index, x=h, W=e) # propagate h = torch.cat((h, chi), dim=1) - h = self.lin_geom(h) + h = self.lin_up(h) - else: # base, simple, sfarinet - h = self.lin_h(self.act(h)) - h = self.propagate(edge_index, x=h, W=W) # propagate + elif self.mp_type in {"base", "simple", "sfarinet"}: + h = self.propagate(edge_index, x=h, W=e) # propagate + h = self.act(self.lin_h(h)) + + else: + raise ValueError("mp_type provided does not exist") return h @@ -447,6 +463,7 @@ class FANet(BaseModel): could be num_filters or hidden_channels. mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'}): specificies the MP of the interaction block. + batch_norm (bool): whether to apply batch norm after every linear layer. 
""" def __init__(self, **kwargs): @@ -460,7 +477,7 @@ def __init__(self, **kwargs): self.max_num_neighbors = kwargs["max_num_neighbors"] self.edge_embed_type = kwargs["edge_embed_type"] self.skip_co = kwargs["skip_co"] - if kwargs["mp_type"] == 'sfarinet': + if kwargs["mp_type"] == "sfarinet": kwargs["num_filters"] = kwargs["hidden_channels"] self.act = ( diff --git a/ocpmodels/models/utils/attention_model.py b/ocpmodels/models/utils/attention_model.py index b280680f51..6ada3eebb1 100644 --- a/ocpmodels/models/utils/attention_model.py +++ b/ocpmodels/models/utils/attention_model.py @@ -1,12 +1,20 @@ -from typing import Optional, Union +import math +from typing import Optional, Tuple, Union import torch import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter from torch_geometric.nn.conv import MessagePassing +from torch_geometric.nn.dense.linear import Linear from torch_geometric.nn.inits import glorot, zeros -from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size +from torch_geometric.typing import ( + Adj, + OptPairTensor, + OptTensor, + PairTensor, + Size, +) from torch_geometric.utils import softmax from torch_sparse import SparseTensor @@ -140,3 +148,174 @@ def __repr__(self) -> str: f"{self.__class__.__name__}({self.in_channels}, " f"{self.hidden_channels}, heads={self.heads})" ) + + +class TransfoAttConv(MessagePassing): + r"""The graph transformer operator from the `"Masked Label Prediction: + Unified Message Passing Model for Semi-Supervised Classification" + `_ paper + + Args: + in_channels (int or tuple): Size of each input sample, or :obj:`-1` to + derive the size from the first input(s) to the forward method. + A tuple corresponds to the sizes of source and target + dimensionalities. + out_channels (int): Size of each output sample. + heads (int, optional): Number of multi-head-attentions. + (default: :obj:`1`) + concat (bool, optional): If set to :obj:`False`, the multi-head + attentions are averaged instead of concatenated. + (default: :obj:`True`) + beta (bool, optional): If set, will combine aggregation and + skip information via + dropout (float, optional): Dropout probability of the normalized + attention coefficients which exposes each node to a stochastically + sampled neighborhood during training. (default: :obj:`0`) + edge_dim (int, optional): Edge feature dimensionality (in case + there are any). Edge features are convoled with value features prior + multiplication by attention coefficient + bias (bool, optional): If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + root_weight (bool, optional): If set to :obj:`False`, the layer will + not add the transformed root node features to the output and the + option :attr:`beta` is set to :obj:`False`. (default: :obj:`True`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. 
+ """ + _alpha: OptTensor + + def __init__( + self, + in_channels: Union[int, Tuple[int, int]], + out_channels: int, + heads: int = 1, + concat: bool = True, + dropout: float = 0.0, + edge_dim: Optional[int] = None, + bias: bool = True, + root_weight: bool = True, + **kwargs, + ): + kwargs.setdefault("aggr", "add") + super(TransfoAttConv, self).__init__(node_dim=0, **kwargs) + + self.in_channels = in_channels + self.out_channels = out_channels + self.heads = heads + self.root_weight = root_weight + self.concat = concat + self.dropout = dropout + self.edge_dim = edge_dim + self._alpha = None + + if isinstance(in_channels, int): + in_channels = (in_channels, in_channels) + + self.lin_key = Linear(in_channels[0], heads * out_channels) + self.lin_query = Linear(in_channels[1], heads * out_channels) + self.lin_value = Linear(in_channels[0], heads * out_channels) + if edge_dim is not None: + self.lin_edge = Linear(edge_dim, heads * out_channels, bias=False) + else: + self.lin_edge = self.register_parameter("lin_edge", None) + + if concat: + self.lin_skip = Linear(in_channels[1], heads * out_channels, bias=bias) + else: + self.lin_skip = Linear(in_channels[1], out_channels, bias=bias) + + self.reset_parameters() + + def reset_parameters(self): + self.lin_key.reset_parameters() + self.lin_query.reset_parameters() + self.lin_value.reset_parameters() + if self.edge_dim: + self.lin_edge.reset_parameters() + self.lin_skip.reset_parameters() + + def forward( + self, + x: Union[Tensor, PairTensor], + edge_index: Adj, + edge_attr: OptTensor = None, + return_attention_weights=None, + ): + r""" + Args: + return_attention_weights (bool, optional): If set to :obj:`True`, + will additionally return the tuple + :obj:`(edge_index, attention_weights)`, holding the computed + attention weights for each edge. 
(default: :obj:`None`) + """ + + H, C = self.heads, self.out_channels + + if isinstance(x, Tensor): + x: PairTensor = (x, x) + + query = self.lin_query(x[1]).view(-1, H, C) + key = self.lin_key(x[0]).view(-1, H, C) + value = self.lin_value(x[0]).view(-1, H, C) + + # propagate_type: (query: Tensor, key:Tensor, value: Tensor, edge_attr: OptTensor) # noqa + out = self.propagate( + edge_index, + query=query, + key=key, + value=value, + edge_attr=edge_attr, + size=None, + ) + + alpha = self._alpha + self._alpha = None + + if self.concat: + out = out.view(-1, self.heads * self.out_channels) + else: + out = out.mean(dim=1) + + if self.root_weight: + x_r = self.lin_skip(x[1]) + out = out + x_r + + if isinstance(return_attention_weights, bool): + assert alpha is not None + if isinstance(edge_index, Tensor): + return out, (edge_index, alpha) + elif isinstance(edge_index, SparseTensor): + return out, edge_index.set_value(alpha, layout="coo") + else: + return out + + def message( + self, + query_i: Tensor, + key_j: Tensor, + value_j: Tensor, + edge_attr: OptTensor, + index: Tensor, + ptr: OptTensor, + size_i: Optional[int], + ) -> Tensor: + + # Compute edge embed + if self.lin_edge is not None: + assert edge_attr is not None + edge_attr = self.lin_edge(edge_attr).view(-1, self.heads, self.out_channels) + + # Compute attention coefficient + alpha = (query_i * key_j).sum(dim=-1) / math.sqrt(self.out_channels) + alpha = softmax(alpha, index, ptr, size_i) + self._alpha = alpha + alpha = F.dropout(alpha, p=self.dropout, training=self.training) + + out = value_j * alpha.view(-1, self.heads, 1) * edge_attr + return out + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}({self.in_channels}, " + f"{self.out_channels}, heads={self.heads})" + ) From 9cfc910b8312b9e7dab54b78b88e4e1c59297a75 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 12 Jan 2023 10:57:10 -0500 Subject: [PATCH 068/273] multiple frame averaging -- without dealing with inference --- ocpmodels/datasets/data_transforms.py | 2 ++ ocpmodels/preprocessing/frame_averaging.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/ocpmodels/datasets/data_transforms.py b/ocpmodels/datasets/data_transforms.py index 8f838ecd7a..5229751a97 100644 --- a/ocpmodels/datasets/data_transforms.py +++ b/ocpmodels/datasets/data_transforms.py @@ -47,9 +47,11 @@ def __init__(self, fa_type=None, fa_frames=None): "random", "det", "all", + "multiple" "se3-random", "se3-det", "se3-all", + "se3-multiple" } if self.fa_type: diff --git a/ocpmodels/preprocessing/frame_averaging.py b/ocpmodels/preprocessing/frame_averaging.py index 5ef1af72c3..b7f2d54742 100644 --- a/ocpmodels/preprocessing/frame_averaging.py +++ b/ocpmodels/preprocessing/frame_averaging.py @@ -33,6 +33,7 @@ def all_frames(eigenvec, pos, cell, fa_frames="random", pos_3D=None, det_index=0 "se3-all", "se3-random", "se3-det", + "se3-multiple", } fa_cell = deepcopy(cell) @@ -76,6 +77,13 @@ def all_frames(eigenvec, pos, cell, fa_frames="random", pos_3D=None, det_index=0 # Return frame(s) depending on method fa_frames if fa_frames == "all" or fa_frames == "se3-all": return all_fa_pos, all_cell, all_rots + + if fa_frames == "multiple" or fa_frames == "se3-multiple": + indexes = torch.bernoulli(torch.tensor([0.5] * len(all_fa_pos))) + all_fa_pos = [a for a, b in zip(all_fa_pos, indexes) if b] + all_cell = [a for a, b in zip(all_cell, indexes) if b] + all_rots = [a for a, b in zip(all_rots, indexes) if b] + return all_fa_pos, all_cell, all_rots elif fa_frames == "det" or fa_frames == 
"se3-det": return [all_fa_pos[det_index]], [all_cell[det_index]], [all_rots[det_index]] From ae8468acbe51553a98a800100db68dc3fa488262 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 12 Jan 2023 12:48:07 -0500 Subject: [PATCH 069/273] fix FA multiple + complex mp + att heads + skip_co --- configs/exps/icml/test_params.yaml | 13 +++-- configs/models/fanet.yaml | 5 +- ocpmodels/common/flags.py | 12 ++++- ocpmodels/datasets/data_transforms.py | 4 +- ocpmodels/models/fanet.py | 60 ++++++++++++++++------ ocpmodels/preprocessing/frame_averaging.py | 18 ++++--- scripts/gnn_dev.py | 7 ++- 7 files changed, 87 insertions(+), 32 deletions(-) diff --git a/configs/exps/icml/test_params.yaml b/configs/exps/icml/test_params.yaml index 85a48351e5..6c019ddc6d 100644 --- a/configs/exps/icml/test_params.yaml +++ b/configs/exps/icml/test_params.yaml @@ -15,14 +15,19 @@ default: pg_hidden_channels: 0 # shall have been 32 energy_head: False # False ? optim: - max_epochs: 10 - wandb_tags: 'prop-check-ICLM' + max_epochs: 5 + wandb_tags: 'test-extension' + cp_data_to_tmpdir: true runs: - config: sfarinet-s2ef-2M - note: 'All No TMP 1 GPU with grad target' + note: 'Multiple FA with direct_with_gradient_target' model: - regress_forces: direct + regress_forces: direct_with_gradient_target + mp_type: base_with_att + skip_co: add + complex_mp: true + att_heads: 3 optim: batch_size: 192 eval_batch_size: 192 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 7426140a2b..7822609aac 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -16,10 +16,13 @@ default: phys_hidden_channels: 0 energy_head: False # can be {False, weighted-av-initial-embeds, weighted-av-final-embeds, pooling, graclus, random} # fanet new features - skip_co: False # output skip connections + skip_co: False # output skip connections {False, "add", "concat"} second_layer_MLP: False # in EmbeddingBlock + complex_mp: False edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} + batch_norm: False # bool + att_heads: 1 # int force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True force_decoder_model_config: simple: diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 30024baa19..519465386b 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -202,7 +202,17 @@ def add_core_args(self): type=str, default="", help="Frame averaging method to use", - choices=["", "random", "det", "all", "se3-all", "se3-random", "se3-det"], + choices=[ + "", + "random", + "det", + "all", + "se3-all", + "se3-random", + "se3-det", + "multiple", + "se3-multiple", + ], ) self.parser.add_argument( "--graph_rewiring", diff --git a/ocpmodels/datasets/data_transforms.py b/ocpmodels/datasets/data_transforms.py index 5229751a97..1e85fff5c0 100644 --- a/ocpmodels/datasets/data_transforms.py +++ b/ocpmodels/datasets/data_transforms.py @@ -47,11 +47,11 @@ def __init__(self, fa_type=None, fa_frames=None): "random", "det", "all", - "multiple" + "multiple", "se3-random", "se3-det", "se3-all", - "se3-multiple" + "se3-multiple", } if self.fa_type: diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 5390027b97..fb84675096 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -5,6 +5,7 @@ from torch import nn from torch.nn import Embedding, Linear from torch_geometric.nn import MessagePassing, TransformerConv, radius_graph +from torch_geometric.nn.norm 
import BatchNorm, GraphNorm from torch_scatter import scatter from ocpmodels.common.registry import registry @@ -230,11 +231,14 @@ def forward( class InteractionBlock(MessagePassing): - def __init__(self, hidden_channels, num_filters, act, mp_type): + def __init__( + self, hidden_channels, num_filters, act, mp_type, complex_mp, att_heads=1 + ): super(InteractionBlock, self).__init__() self.act = act self.mp_type = mp_type self.hidden_channels = hidden_channels + self.complex_mp = complex_mp if self.mp_type == "simple": self.lin_geom = nn.Linear(num_filters, hidden_channels) @@ -259,7 +263,7 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_geom = TransfoAttConv( hidden_channels, hidden_channels, - heads=1, + heads=att_heads, concat=True, root_weight=False, edge_dim=num_filters, @@ -269,7 +273,7 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_geom = TransformerConv( hidden_channels, hidden_channels, - heads=1, + heads=att_heads, concat=True, root_weight=False, edge_dim=num_filters, @@ -290,17 +294,26 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): ) self.lin_h = nn.Linear(hidden_channels, hidden_channels) + if self.complex_mp: + self.other_mlp = nn.Linear(hidden_channels, hidden_channels) + + self.reset_parameters() + def reset_parameters(self): - if self.mp_type != "sfarinet": + if self.mp_type not in {"sfarinet", "att", "base_with_att"}: nn.init.xavier_uniform_(self.lin_geom.weight) self.lin_geom.bias.data.fill_(0) - nn.init.xavier_uniform_(self.lin_h.weight) - self.lin_h.bias.data.fill_(0) - if self.mp_type == "updownscale": + if self.complex_mp: + nn.init.xavier_uniform_(self.other_mlp.weight) + self.other_mlp.bias.data.fill_(0) + if self.mp_type in {"updownscale", "base_updownscale", "updown_local_env"}: nn.init.xavier_uniform_(self.lin_up.weight) self.lin_up.bias.data.fill_(0) nn.init.xavier_uniform_(self.lin_down.weight) self.lin_down.bias.data.fill_(0) + else: + nn.init.xavier_uniform_(self.lin_h.weight) + self.lin_h.bias.data.fill_(0) def forward(self, h, edge_index, e): @@ -353,6 +366,9 @@ def forward(self, h, edge_index, e): else: raise ValueError("mp_type provided does not exist") + if self.complex_mp: + h = self.act(self.other_mlp(h)) + return h def message(self, x_j, W, local_env=None): @@ -454,16 +470,18 @@ class FANet(BaseModel): (default: :obj:`4`) num_gaussians (int): The number of gaussians :math:`\mu`. (default: :obj:`50`) - second_layer_MLP (bool): use 2-layers MLP at the end of embedding block. - skip_co (bool): add a skip connection between interaction blocks and + second_layer_MLP (bool): use 2-layers MLP at the end of the Embedding block. + skip_co (str): add a skip connection between each interaction block and energy-head. edge_embed_type (str, in {'rij','all_rij','sh', 'all'}): input feature of the edge embedding block. edge_embed_hidden (int): size of edge representation. could be num_filters or hidden_channels. - mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'}): + mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env' + 'updownscale_base', 'updownscale', 'updown_local_env', 'sfarinet'}}): specificies the MP of the interaction block. batch_norm (bool): whether to apply batch norm after every linear layer. 
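+        att_heads (int): number of attention heads used by the attention-based
+            mp_type variants ('att', 'base_with_att').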
+ complex_mp (bool); whether to add a second layer MLP at the end of each Interaction """ def __init__(self, **kwargs): @@ -516,6 +534,8 @@ def __init__(self, **kwargs): kwargs["num_filters"], self.act, kwargs["mp_type"], + kwargs["complex_mp"], + kwargs["att_heads"], ) for _ in range(kwargs["num_interactions"]) ] @@ -542,6 +562,10 @@ def __init__(self, **kwargs): else None ) + # Skip co + if self.skip_co == "concat": + self.mlp_skip_co = Linear((kwargs["num_interactions"] + 1), 1) + @conditional_grad(torch.enable_grad()) def forces_forward(self, preds): return self.decoder(preds["hidden_state"]) @@ -598,21 +622,25 @@ def energy_forward(self, data): alpha = self.w_lin(h) else: alpha = None - energy_skip_co = torch.zeros(max(batch) + 1, device=h.device).unsqueeze(1) # Interaction blocks + energy_skip_co = [] for interaction in self.interaction_blocks: if self.skip_co: - energy_skip_co += self.output_block( - h, edge_index, edge_weight, batch, alpha + energy_skip_co.append( + self.output_block(h, edge_index, edge_weight, batch, alpha) ) h = h + interaction(h, edge_index, e) # Output block energy = self.output_block(h, edge_index, edge_weight, batch, alpha) - # skip-connection - if self.skip_co: - energy += energy_skip_co + + # Skip-connection + energy_skip_co.append(energy) + if self.skip_co == "concat": + energy = self.mlp_skip_co(torch.cat(energy_skip_co, dim=1)) + else: + energy = energy_skip_co.sum() preds = {"energy": energy, "pooling_loss": pooling_loss, "hidden_state": h} diff --git a/ocpmodels/preprocessing/frame_averaging.py b/ocpmodels/preprocessing/frame_averaging.py index b7f2d54742..198cff6933 100644 --- a/ocpmodels/preprocessing/frame_averaging.py +++ b/ocpmodels/preprocessing/frame_averaging.py @@ -77,13 +77,19 @@ def all_frames(eigenvec, pos, cell, fa_frames="random", pos_3D=None, det_index=0 # Return frame(s) depending on method fa_frames if fa_frames == "all" or fa_frames == "se3-all": return all_fa_pos, all_cell, all_rots - + if fa_frames == "multiple" or fa_frames == "se3-multiple": - indexes = torch.bernoulli(torch.tensor([0.5] * len(all_fa_pos))) - all_fa_pos = [a for a, b in zip(all_fa_pos, indexes) if b] - all_cell = [a for a, b in zip(all_cell, indexes) if b] - all_rots = [a for a, b in zip(all_rots, indexes) if b] - return all_fa_pos, all_cell, all_rots + index = torch.bernoulli(torch.tensor([0.5] * len(all_fa_pos))) + if index.sum() == 0: + index = random.randint(0, len(all_fa_pos) - 1) + return [all_fa_pos[index]], [all_cell[index]], [all_rots[index]] + if index.sum() == 1: + return [all_fa_pos[index]], [all_cell[index]], [all_rots[index]] + else: + all_fa_pos = [a for a, b in zip(all_fa_pos, index) if b] + all_cell = [a for a, b in zip(all_cell, index) if b] + all_rots = [a for a, b in zip(all_rots, index) if b] + return all_fa_pos, all_cell, all_rots elif fa_frames == "det" or fa_frames == "se3-det": return [all_fa_pos[det_index]], [all_cell[det_index]], [all_rots[det_index]] diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 2797921988..9660de9af1 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -19,12 +19,15 @@ # Customize args config["graph_rewiring"] = "remove-tag-0" config["frame_averaging"] = "2D" - config["fa_frames"] = "random" # "random" + config["fa_frames"] = "all" # "random" config["test_ri"] = True config["optim"] = {"max_epochs": 1} config["model"] = {"use_pbc": True} - config["model"]["edge_embed_type"] = "rij" + config["model"]["edge_embed_type"] = "all_rij" config["model"]["mp_type"] = "att" + config["model"]["skip_co"] = 
"add" + config["model"]["complex_mp"] = True + # config["model"]["regress_forces"] = "direct_with_gradient_target" checkpoint_path = None # "checkpoints/2022-04-28-11-42-56-dimenetplusplus/" + "best_checkpoint.pt" From 32368505bcbc583601a463868d321e85f25d7d13 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 12 Jan 2023 13:03:31 -0500 Subject: [PATCH 070/273] fix all embedding and start batch norm --- ocpmodels/models/fanet.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index fb84675096..aaa944d4c6 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -122,7 +122,7 @@ def __init__( elif self.edge_embed_type == "sh": self.lin_e1 = Linear(15, num_filters) elif self.edge_embed_type == "all": - self.lin_e1 = Linear(15, num_filters) + self.lin_e1 = Linear(15 + num_gaussians, num_filters) else: raise ValueError("edge_embedding_type does not exist") @@ -180,7 +180,7 @@ def forward( normalize=False, normalization="component", ) - e = torch.cat((rel_pos, self.sh), dim=1) + e = torch.cat((rel_pos, self.sh, edge_attr), dim=1) e = self.lin_e1(e) e = self.act(e) # can comment out @@ -232,7 +232,7 @@ def forward( class InteractionBlock(MessagePassing): def __init__( - self, hidden_channels, num_filters, act, mp_type, complex_mp, att_heads=1 + self, hidden_channels, num_filters, act, mp_type, complex_mp, att_heads, batch_norm ): super(InteractionBlock, self).__init__() self.act = act @@ -536,6 +536,7 @@ def __init__(self, **kwargs): kwargs["mp_type"], kwargs["complex_mp"], kwargs["att_heads"], + kwargs["batch_norm"] ) for _ in range(kwargs["num_interactions"]) ] From 8a2555ed4b997297e8292231abd7db65d6fcbb5f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 13:53:34 -0500 Subject: [PATCH 071/273] typo in `LinearWarmupCosineAnnealingLR` --- ocpmodels/modules/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 7c5c01ac1a..8a4d082188 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -43,7 +43,7 @@ def scheduler_lambda_fn(x): self.scheduler = getattr(lr_scheduler, self.scheduler_type) scheduler_args = self.filter_kwargs(self.optim_config) self.scheduler = self.scheduler(optimizer, **scheduler_args) - elif self.scheduler_type == "WarmupCosineAnnealingLR": + elif self.scheduler_type == "LinearWarmupCosineAnnealingLR": self.warmup_scheduler = warmup.ExponentialWarmup( self.optimizer, warmup_period=self.optim_config["warmup_steps"] ) From 1161ed60b3cb857732145c958d5539497b84e1dd Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 16:51:33 -0500 Subject: [PATCH 072/273] exp manager --- ocpmodels/common/exp_manager.py | 230 ++++++++++++++++++++++++++++++++ ocpmodels/common/utils.py | 4 +- 2 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 ocpmodels/common/exp_manager.py diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py new file mode 100644 index 0000000000..02459ba8ed --- /dev/null +++ b/ocpmodels/common/exp_manager.py @@ -0,0 +1,230 @@ +from orion.client import get_experiment +from pathlib import Path +from collections import defaultdict, Counter +import wandb +from textwrap import dedent +from minydra import resolved_args +import os +import sys + +rundir = Path(os.environ["SCRATCH"]) / "ocp" / "runs" + + +class Manager: + def __init__( + self, + orion_db_path="", + name="", + 
wandb_path="mila-ocp/ocp-qm", + ): + self.api = wandb.Api() + self.wandb_path = wandb_path + self.wandb_runs = [ + r + for r in self.api.runs(wandb_path) + if "orion_hash_params" in r.config + and name in r.config.get("orion_exp_config_path", "") + ] + self.name = name + self.trial_hparams_to_rundirs = defaultdict(list) + self.exp = get_experiment( + name=name, + storage={ + "database": { + "host": str(orion_db_path), + "type": "pickleddb", + } + }, + ) + self.trials = self.exp.fetch_trials() + self.budgets = self.exp.algorithms.algorithm.budgets + self.total_budgets = sum( + b.n_trials for bracket in self.budgets for b in bracket + ) + self.id_to_trial = {t.id: t for t in self.trials} + self.id_to_wandb_runs = { + t.id: sorted( + [ + r + for r in self.wandb_runs + if r.config["orion_hash_params"] == t.hash_params + ], + key=lambda r: r.config["job_id"], + ) + for t in self.trials + } + self.hash_to_trials = defaultdict(list) + for t in self.trials: + self.hash_to_trials[t.hash_params].append(t) + self.discover_run_dirs() + print(Manager.help()) + print("\n") + print("{:31} : {:4} ".format("Trials in experiment", len(self.trials))) + print("{:31} : {:4}".format("Total expected trials", self.total_budgets)) + print( + "{:31} : {:4} ".format( + "Trials status", + " ".join( + [ + f"{k}->{v}" + for k, v in Counter([t.status for t in self.trials]).items() + ] + ), + ) + ) + print( + "{:31} : {}".format( + "Trial level(=rung) distribution", + " ".join( + [ + f"{k}->{v}" + for k, v in Counter( + map(len, self.hash_to_trials.values()) + ).items() + ] + ), + ) + ) + print( + "{:31} : {:4}".format( + "Existing unique trials executed", len(self.trial_hparams_to_rundirs) + ) + ) + print( + "{:31} : {:4}".format( + "Total existing trial run dirs", + sum(len(v) for v in self.trial_hparams_to_rundirs.values()), + ) + ) + print("{:31} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) + print("{:31} : {}".format("Algorithm's budgets", str(self.budgets))) + + def discover_run_dirs(self): + for unique in rundir.glob("*/*.unique"): + self.trial_hparams_to_rundirs[unique.stem.split("--")[-1]].append( + unique.parent + ) + + def get_dirs_for_trial(self, trial): + if trial.hash_params in self.trial_hparams_to_rundirs: + return self.trial_hparams_to_rundirs[trial.hash_params] + else: + print(f"No run dir for this trial with hparams {trial.hash_params}.") + + def get_trial_for_id(self, id): + if id in self.id_to_trial: + return self.id_to_trial[id] + else: + print("No trial for this id.") + + def get_dirs_for_id(self, id): + return self.get_dirs_for_trial(self.get_trial_for_id(id)) + + def get_reserved_wandb_runs(self): + reserved = {} + for trial_id, wandb_runs in self.id_to_wandb_runs.items(): + trial = self.get_trial_for_id(trial_id) + if trial.status == "reserved": + reserved[trial_id] = {"wandb_run": wandb_runs, "trial": trial} + return reserved + + def print_wandb_query(self): + print( + "WandB runs query:\n" + + "(" + + "|".join( + sorted( + [ + p.name + for runs in self.trial_hparams_to_rundirs.values() + for p in runs + ] + ) + ) + + ")" + ) + + @classmethod + def help(self): + return dedent( + """\ + -------------- + Manager init() + -------------- + + orion_db_path -> (str or pathlib.Path) pointing to the orion db pickle file + name -> (str) unique orion experiment name in the db + wandb_path -> (str) path to the wandb project like "{entity}/{project}" + + ---------- + Attributes + ---------- + + manager.trial_hparams_to_rundirs -> dict {trial.params_hash: [list of run dirs]} + manager.exp -> 
Orion experiment object + manager.trials -> list of Orion trial objects for this exp + manager.budgets -> list of budget of the exp's algorithm: n_trials and resources associated + manager.total_budgets -> total number of trials expected for this exp + manager.id_to_trial -> dict {trial_id: trial} + manager.id_to_wandb_runs -> dict {trial_id: [list of wandb Run objects]} + manager.hash_to_trials -> dict {hash_params: [list Orion trial objects]} + + ------- + Methods + ------- + + manager.get_dirs_for_trial(trial_obj: orion.Trial) -> list of run dirs for this trial + manager.get_trial_for_id(trial_id: str) -> trial object for this trial_id (wrapper around manager.id_to_trial[trial_id]) + manager.get_dirs_for_id(trial_id: str) -> list of run dirs for this trial_id + manager.get_reserved_wandb_runs() -> dict {trial_id: {"wandb_run": [list of wandb Run objects], "trial": trial}} + get the currently reserved trials and their wandb runs + + -------- + Examples + -------- + + m = Manager(orion_db_path="./data/orion/storage/orion_db.pkl", name="ocp-qm9-orion-debug-v1.0.0", wandb_path="mila-ocp/ocp-qm") + exp_df = m.exp.to_pandas() + reserved_wandbs = m.get_reserved_wandb_runs() + print(list(reserved_wandbs.values())[0]["wandb_run"][0].config["run_dir"]) + """ + ) + + +if __name__ == "__main__": + defaults = { + "help": False, + "name": None, + "wandb_path": None, + "orion_db_path": str( + Path(__file__).resolve().parent.parent.parent + / "data/orion/storage/orion_db.pkl" + ), + } + args = resolved_args(defaults=defaults) + if args.help: + print("🖱 Command-line (default) parameters:") + print("\n".join(" {:15} : {}".format(k, v) for k, v in defaults.items())) + print("\n\n🐍 Example command-line in IPython:") + print( + "In [1]: run ocpmodels/common/exp_manager.py", + "name='ocp-qm9-orion-debug-v1.0.0' wandb_path='mila-ocp/ocp-3'", + ) + print("\n\n🧞 Manager help:") + print(Manager.help()) + sys.exit(0) + + if not args.name: + raise ValueError("Please provide a name for the experiment.") + if not args.wandb_path: + raise ValueError("Please provide a wandb_path.") + + m = Manager( + name=args.name, + wandb_path=args.wandb_path, + orion_db_path=args.orion_db_path, + ) + + m.print_wandb_query() + exp_df = m.exp.to_pandas() + reserved_wandbs = m.get_reserved_wandb_runs() diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 7945a15121..8851943af5 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -52,10 +52,12 @@ def load_orion_exp(args): ), "Must provide orion_unique_exp_name in the command-line or the config file." 
print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") + db_path = ROOT / "data" / "orion" / "storage" / "orion_db.pkl" + db_path.parent.mkdir(parents=True, exist_ok=True) experiment = build_experiment( storage={ "database": { - "host": str(ROOT / "data" / "orion" / "storage" / "orion_db.pkl"), + "host": str(db_path), "type": "pickleddb", } }, From d71d64a9deb5d4324af82fcdbddf99c92e0bbee5 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 16:59:44 -0500 Subject: [PATCH 073/273] add dummy exp --- configs/exps/debug/dummy.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 configs/exps/debug/dummy.yaml diff --git a/configs/exps/debug/dummy.yaml b/configs/exps/debug/dummy.yaml new file mode 100644 index 0000000000..8f3b7b570b --- /dev/null +++ b/configs/exps/debug/dummy.yaml @@ -0,0 +1,27 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 24GB + cpus: 4 + gres: gpu:1 + time: 30:00 + partition: unkillable + +default: + wandb_project: ocp-debug + config: schnet-qm9-all + mode: train + wandb_tags: qm9, debug + optim: + batch_size: 64 + max_epochs: -1 + max_steps: 1e3 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + +runs: + - model: + hidden_channels: 128 + - model: + hidden_channels: 64 \ No newline at end of file From daeab01dcf2bdc1f774ee67ea235c0d31ad26a7c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 16:59:56 -0500 Subject: [PATCH 074/273] dummy uses main --- configs/exps/debug/dummy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/debug/dummy.yaml b/configs/exps/debug/dummy.yaml index 8f3b7b570b..9da5053589 100644 --- a/configs/exps/debug/dummy.yaml +++ b/configs/exps/debug/dummy.yaml @@ -4,7 +4,7 @@ job: cpus: 4 gres: gpu:1 time: 30:00 - partition: unkillable + partition: main default: wandb_project: ocp-debug From 6e7c024fbcb65dc3b793e4bfcce638a1d1a6c4b7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 17:55:15 -0500 Subject: [PATCH 075/273] improve exp manager --- ocpmodels/common/exp_manager.py | 67 ++++++++++++++++++++++----------- sbatch.py | 17 +++++++++ 2 files changed, 62 insertions(+), 22 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 02459ba8ed..1bbe5dd1b0 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -57,7 +57,9 @@ def __init__( for t in self.trials: self.hash_to_trials[t.hash_params].append(t) self.discover_run_dirs() - print(Manager.help()) + self.job_ids = sorted( + [p.name for runs in self.trial_hparams_to_rundirs.values() for p in runs] + ) print("\n") print("{:31} : {:4} ".format("Trials in experiment", len(self.trials))) print("{:31} : {:4}".format("Total expected trials", self.total_budgets)) @@ -87,7 +89,7 @@ def __init__( ) print( "{:31} : {:4}".format( - "Existing unique trials executed", len(self.trial_hparams_to_rundirs) + "Existing unique HP sets executed", len(self.trial_hparams_to_rundirs) ) ) print( @@ -98,9 +100,33 @@ def __init__( ) print("{:31} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) print("{:31} : {}".format("Algorithm's budgets", str(self.budgets))) + sq = set( + [ + j.strip() + for j in os.popen("/opt/slurm/bin/squeue -u $USER -o '%12i'") + .read() + .splitlines()[1:] + ] + ) + running = 
set(self.job_ids) & sq + waiting = ( + set([j.parent.name for j in rundir.glob(f"*/{self.name}.exp")]) & sq + ) - running + print( + "{:31} : {}".format( + "Jobs currently running:", + f"{len(running)} " + " ".join(running), + ) + ) + print( + "{:31} : {}".format( + "Jobs currently waiting:", + f"{len(waiting)} " + " ".join(waiting), + ) + ) def discover_run_dirs(self): - for unique in rundir.glob("*/*.unique"): + for unique in rundir.glob(f"*/{self.name}--*.unique"): self.trial_hparams_to_rundirs[unique.stem.split("--")[-1]].append( unique.parent ) @@ -125,24 +151,11 @@ def get_reserved_wandb_runs(self): for trial_id, wandb_runs in self.id_to_wandb_runs.items(): trial = self.get_trial_for_id(trial_id) if trial.status == "reserved": - reserved[trial_id] = {"wandb_run": wandb_runs, "trial": trial} + reserved[trial_id] = {"wandb_runs": wandb_runs, "trial": trial} return reserved def print_wandb_query(self): - print( - "WandB runs query:\n" - + "(" - + "|".join( - sorted( - [ - p.name - for runs in self.trial_hparams_to_rundirs.values() - for p in runs - ] - ) - ) - + ")" - ) + print("WandB runs query:\n" + "(" + "|".join(self.job_ids) + ")") @classmethod def help(self): @@ -176,7 +189,7 @@ def help(self): manager.get_dirs_for_trial(trial_obj: orion.Trial) -> list of run dirs for this trial manager.get_trial_for_id(trial_id: str) -> trial object for this trial_id (wrapper around manager.id_to_trial[trial_id]) manager.get_dirs_for_id(trial_id: str) -> list of run dirs for this trial_id - manager.get_reserved_wandb_runs() -> dict {trial_id: {"wandb_run": [list of wandb Run objects], "trial": trial}} + manager.get_reserved_wandb_runs() -> dict {trial_id: {"wandb_runs": [list of wandb Run objects], "trial": trial}} get the currently reserved trials and their wandb runs -------- @@ -186,7 +199,7 @@ def help(self): m = Manager(orion_db_path="./data/orion/storage/orion_db.pkl", name="ocp-qm9-orion-debug-v1.0.0", wandb_path="mila-ocp/ocp-qm") exp_df = m.exp.to_pandas() reserved_wandbs = m.get_reserved_wandb_runs() - print(list(reserved_wandbs.values())[0]["wandb_run"][0].config["run_dir"]) + print(list(reserved_wandbs.values())[0]["wandb_runs"][0].config["run_dir"]) """ ) @@ -215,10 +228,20 @@ def help(self): sys.exit(0) if not args.name: - raise ValueError("Please provide a name for the experiment.") + raise ValueError( + "Please provide `name=` for the experiment." + + " See `$ python exp_manager.py help`" + ) if not args.wandb_path: - raise ValueError("Please provide a wandb_path.") + raise ValueError( + "Please provide `wandb_path='{entity}/{project}}'`." 
+ + " See `$ python exp_manager.py help`" + ) + print( + "💃 Status of experiment", + f"'{args.name}' and wandb entity/project '{args.wandb_path}':", + ) m = Manager( name=args.name, wandb_path=args.wandb_path, diff --git a/sbatch.py b/sbatch.py index bb0ff8ab36..ea97c8a7df 100644 --- a/sbatch.py +++ b/sbatch.py @@ -6,6 +6,7 @@ from shutil import copyfile import sys import re +import yaml template = """\ #!/bin/bash @@ -192,6 +193,21 @@ def add_jobid_to_log(j, command_line, exp_name=None): logfile.write_text("\n".join(lines)) +def write_orion_config(args, outdir): + if "--orion_exp_config_path=" not in args.get("py_args", ""): + return + orion_yaml_path = ( + args.py_args.split("--orion_exp_config_path=")[-1] + .split(" --")[0] + .replace("'", "") + ) + copyfile(orion_yaml_path, outdir / "orion_exp_config.yaml") + config = yaml.safe_load(Path(orion_yaml_path).read_text()) + if "unique_exp_name" in config: + unique_exp_name = config["unique_exp_name"] + (outdir / f"{unique_exp_name}.exp").touch() + + if __name__ == "__main__": # has the submission been successful? success = False @@ -348,6 +364,7 @@ def add_jobid_to_log(j, command_line, exp_name=None): print("Creating directory", str(output_parent)) output_parent.mkdir(parents=True, exist_ok=True) copyfile(script_path, output_parent / script_path.name) + write_orion_config(args, output_parent) if not args.verbose: print("Submitted batch job", jobid) add_jobid_to_log(jobid, sbatch_command_line, args.exp_name) From b0fc220953a6bf865535728614de6b586a37800c Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 13 Jan 2023 07:43:30 -0500 Subject: [PATCH 076/273] fix skip co --- configs/exps/gnn/edge_embed_type.yaml | 35 +++- configs/exps/gnn/edge_embed_type_s2ef.yaml | 2 +- configs/exps/gnn/mp_type_3.yaml | 182 +++++++++++++++++++++ ocpmodels/models/fanet.py | 20 ++- 4 files changed, 228 insertions(+), 11 deletions(-) create mode 100644 configs/exps/gnn/mp_type_3.yaml diff --git a/configs/exps/gnn/edge_embed_type.yaml b/configs/exps/gnn/edge_embed_type.yaml index b7a0418fe2..9354db6d8a 100644 --- a/configs/exps/gnn/edge_embed_type.yaml +++ b/configs/exps/gnn/edge_embed_type.yaml @@ -22,19 +22,42 @@ default: cp_data_to_tmpdir: true runs: - - config: sfarinet-is2re-all - note: 'Sfarinet baseline sym' - frame_averaging: 2D - fa_frames: se3-random - - config: sfarinet-is2re-all + - config: fanet-is2re-all # 2678275 note: 'all rij' frame_averaging: 2D fa_frames: se3-random model: edge_embed_type: all_rij - - config: sfarinet-is2re-all + mp_type: base + - config: fanet-is2re-all # 2678276 note: 'all' frame_averaging: 2D fa_frames: se3-random model: edge_embed_type: all + mp_type: base + - config: sfarinet-is2re-all # 2678277 + note: 'all rij sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all_rij + mp_type: sfarinet + - config: sfarinet-is2re-all # 2678278 + note: 'sfarinet all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + mp_type: sfarinet + - config: sfarinet-is2re-all # 2678279 + note: 'sfarinet all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + mp_type: base + skip_co: "concat" + complex_mp: true + batch_norm: true + second_layer_mlp: true \ No newline at end of file diff --git a/configs/exps/gnn/edge_embed_type_s2ef.yaml b/configs/exps/gnn/edge_embed_type_s2ef.yaml index 5ad120c07d..5b642e33c8 100644 --- a/configs/exps/gnn/edge_embed_type_s2ef.yaml +++ b/configs/exps/gnn/edge_embed_type_s2ef.yaml @@ -14,7 +14,7 @@ default: phys_embeds: True 
tag_hidden_channels: 64 pg_hidden_channels: 0 # shall have been 32 - energy_head: 'weighted-av-initial-embeds' # False ? + energy_head: False # False ? regress_forces: direct_with_gradient_target wandb_tags: 's2ef-archi-tests' optim: diff --git a/configs/exps/gnn/mp_type_3.yaml b/configs/exps/gnn/mp_type_3.yaml new file mode 100644 index 0000000000..8ba039780b --- /dev/null +++ b/configs/exps/gnn/mp_type_3.yaml @@ -0,0 +1,182 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 + energy_head: False + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 10 + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'simple' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + - config: fanet-is2re-all + note: 'updown_scale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updown_scale + - config: fanet-is2re-all + note: 'local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + - config: fanet-is2re-all + note: 'updown_local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updown_local_env + - config: fanet-is2re-all + note: 'base_with_att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_with_att + - config: fanet-is2re-all + note: 'att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: att + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + - config: fanet-is2re-all + note: 'updownscale_base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + skip_co: concat + - config: fanet-is2re-all + note: 'att skip co' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: att + skip_co: concat + - config: fanet-is2re-all + note: 'local_env add' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + skip_co: add + - config: fanet-is2re-all + note: 'base complex mp' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + complex_mp: true + - config: fanet-is2re-all + note: 'simple complex mp' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + complex_mp: true + - config: fanet-is2re-all + note: 'updown_local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updown_local_env + second_layer_mlp: true + - config: fanet-is2re-all + note: 'base_with_att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_with_att + second_layer_mlp: true + - config: fanet-is2re-all + note: 'sfarinet ' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + batch_norm: true + - config: fanet-is2re-all + note: 'base_updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_updownscale + batch_norm: true + - config: fanet-is2re-all + note: 'simple bigger layers' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + num_filters: 500 + num_gaussians: 200 + hidden_channels: 500 + num_interactions: 4 + tag_hidden_channels: 128 + - 
config: fanet-is2re-all + note: 'more interactions and bigger filters' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + num_filters: 500 + num_gaussians: 100 + num_interactions: 6 + tag_hidden_channels: 128 + - config: fanet-is2re-all + note: 'smaller lr and bigger gamma' + frame_averaging: 2D + fa_frames: se3-random + optim: + lr_initial: 0.0005 + lr_gamma: 0.4 + - config: fanet-is2re-all + note: 'bigger cutoff' + frame_averaging: 2D + fa_frames: se3-random + model: + cutoff: 10.0 + - config: fanet-is2re-all + note: 'DA' + frame_averaging: DA + optim: + max_epochs: 15 \ No newline at end of file diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index aaa944d4c6..471e7c227e 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -232,13 +232,23 @@ def forward( class InteractionBlock(MessagePassing): def __init__( - self, hidden_channels, num_filters, act, mp_type, complex_mp, att_heads, batch_norm + self, + hidden_channels, + num_filters, + act, + mp_type, + complex_mp, + att_heads, + batch_norm, ): super(InteractionBlock, self).__init__() self.act = act self.mp_type = mp_type self.hidden_channels = hidden_channels self.complex_mp = complex_mp + self.batch_norm = batch_norm + if batch_norm: + self.graph_norm = GraphNorm(hidden_channels) if self.mp_type == "simple": self.lin_geom = nn.Linear(num_filters, hidden_channels) @@ -360,6 +370,8 @@ def forward(self, h, edge_index, e): h = self.lin_up(h) elif self.mp_type in {"base", "simple", "sfarinet"}: + if self.batch_norm: + h = self.graph_norm(h) h = self.propagate(edge_index, x=h, W=e) # propagate h = self.act(self.lin_h(h)) @@ -536,7 +548,7 @@ def __init__(self, **kwargs): kwargs["mp_type"], kwargs["complex_mp"], kwargs["att_heads"], - kwargs["batch_norm"] + kwargs["batch_norm"], ) for _ in range(kwargs["num_interactions"]) ] @@ -640,8 +652,8 @@ def energy_forward(self, data): energy_skip_co.append(energy) if self.skip_co == "concat": energy = self.mlp_skip_co(torch.cat(energy_skip_co, dim=1)) - else: - energy = energy_skip_co.sum() + elif self.skip_co == "add": + energy = sum(energy_skip_co) preds = {"energy": energy, "pooling_loss": pooling_loss, "hidden_state": h} From 241819d09429c45f52b50c21096dfa4aa0f29afc Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 13 Jan 2023 10:29:50 -0500 Subject: [PATCH 077/273] batch norm for all --- configs/exps/gnn/batch_norm.yaml | 53 ++++++++++++++++++++++++++++++++ ocpmodels/models/fanet.py | 14 +++++++-- scripts/gnn_dev.py | 8 +++-- 3 files changed, 70 insertions(+), 5 deletions(-) create mode 100644 configs/exps/gnn/batch_norm.yaml diff --git a/configs/exps/gnn/batch_norm.yaml b/configs/exps/gnn/batch_norm.yaml new file mode 100644 index 0000000000..4655fa3a9b --- /dev/null +++ b/configs/exps/gnn/batch_norm.yaml @@ -0,0 +1,53 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 + energy_head: False + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 10 + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'batch norm after propagate Interaction' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + batch_norm: True + - config: fanet-is2re-all + note: 'batch norm after propagate Interaction' + frame_averaging: 2D + fa_frames: se3-random + model: + 
mp_type: att + batch_norm: True + - config: fanet-is2re-all + note: 'batch norm after propagate Interaction' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + batch_norm: True + - config: fanet-is2re-all + note: 'batch norm after propagate Interaction' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + batch_norm: True diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 471e7c227e..f0ac255805 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -345,20 +345,28 @@ def forward(self, h, edge_index, e): if self.mp_type == "updownscale" or self.mp_type == "updownscale_base": h = self.act(self.lin_down(h)) # downscale node rep. h = self.propagate(edge_index, x=h, W=e) # propagate + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = self.act(self.lin_up(h)) # upscale node rep. elif self.mp_type == "att": h = self.lin_geom(h, edge_index, edge_attr=e) + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) elif self.mp_type == "base_with_att": h = self.lin_geom(h, edge_index, edge_attr=e) # propagate is inside + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) elif self.mp_type == "local_env": chi = self.propagate(edge_index, x=h, W=e, local_env=True) h = self.propagate(edge_index, x=h, W=e) # propagate h = h + chi + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = h = self.act(self.lin_h(h)) elif self.mp_type == "updown_local_env": @@ -366,13 +374,15 @@ def forward(self, h, edge_index, e): chi = self.propagate(edge_index, x=h, W=e, local_env=True) e = self.lin_geom(e) h = self.propagate(edge_index, x=h, W=e) # propagate + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = torch.cat((h, chi), dim=1) h = self.lin_up(h) elif self.mp_type in {"base", "simple", "sfarinet"}: - if self.batch_norm: - h = self.graph_norm(h) h = self.propagate(edge_index, x=h, W=e) # propagate + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) else: diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 9660de9af1..0f59d42311 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -19,14 +19,16 @@ # Customize args config["graph_rewiring"] = "remove-tag-0" config["frame_averaging"] = "2D" - config["fa_frames"] = "all" # "random" + config["fa_frames"] = "random" # "random" config["test_ri"] = True config["optim"] = {"max_epochs": 1} config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "all_rij" - config["model"]["mp_type"] = "att" - config["model"]["skip_co"] = "add" + config["model"]["mp_type"] = "base" + config["model"]["skip_co"] = False + config["model"]["att_heads"] = 3 config["model"]["complex_mp"] = True + config["model"]["batch_norm"] = True # config["model"]["regress_forces"] = "direct_with_gradient_target" checkpoint_path = None From 0f8ef1f9290eac79e7d2458f87972ac552ed7b1c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 10:52:11 -0500 Subject: [PATCH 078/273] handle multiplicative factor for Orion sampling --- main.py | 12 +++++++-- ocpmodels/common/utils.py | 54 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 73d75f5e15..8f83773364 100644 --- a/main.py +++ b/main.py @@ -21,6 +21,7 @@ from ocpmodels.common.utils import ( JOB_ID, auto_note, + apply_mult_factor, build_config, continue_from_slurm_job_id, continue_orion_exp, @@ -76,8 +77,15 @@ def run(self, orion_exp=None): if 
distutils.is_master(): if orion_exp: orion_trial = orion_exp.suggest(1) - self.hparams = unflatten_dict(orion_trial.params, sep="/") - self.hparams["orion_hash_params"] = orion_trial.hash_params + self.hparams = unflatten_dict( + apply_mult_factor( + orion_trial.hash_params, + self.trainer_config.get("orion_mult_factor"), + sep="/", + ), + sep="/", + ) + self.hparams["orion_hash_params"] = orion_trial.params self.hparams["orion_unique_exp_name"] = orion_exp.name should_be_0 = distutils.get_rank() diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 8851943af5..9203819107 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -44,6 +44,60 @@ JOB_ID = os.environ.get("SLURM_JOB_ID") +def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): + """ + Multiplies all values of orion_hparams listed in mult_factor_dict["targets"] + by mult_factor_dict["value"]. + + eg: + >>> orion_hparams = { + "model/hidden_channels": 4, + "model/num_layers": 4, + "optim/batch_size": 4, + "optim/initial_lr": 0.001, + "frame_averaging": "", + } + + >>> mult_factor_dict = {"value": 32, "targets": "hidden_channels, batch_size"} + + >>> apply_mult_factor(orion_hparams, mult_factor_dict, sep="/") + { + "model/hidden_channels": 128, + "model/num_layers": 4, + "optim/batch_size": 128, + "optim/initial_lr": 0.001, + "frame_averaging": "" + } + + Args: + orion_hparams (_type_): _description_ + mult_factor_dict (_type_): _description_ + sep (str, optional): _description_. Defaults to ".". + + Returns: + _type_: _description_ + """ + if not mult_factor_dict: + return orion_hparams + if not isinstance(mult_factor_dict, dict): + print( + f">>> Warning: ignoring apply_mult_factor, not a dict: {mult_factor_dict}." + ) + if "value" not in mult_factor_dict or "targets" not in mult_factor_dict: + print( + ">>> Warning: ignoring apply_mult_factor, " + + " missing 'value' or 'targets' keys: {}.".format(mult_factor_dict) + ) + value, targets = mult_factor_dict["value"], mult_factor_dict["targets"] + targets = set([t.strip() for t in targets.split(",")]) + updated_hparams = copy.deepcopy(orion_hparams) + for k, v in orion_hparams.items(): + target = k.split(sep)[-1] + if target in targets: + updated_hparams[k] = v * value + return updated_hparams + + def load_orion_exp(args): exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) From 9a578f8fed34da45317378b7c4bdbacea1c4786c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 10:52:33 -0500 Subject: [PATCH 079/273] refactor prints to `print_status` --- ocpmodels/common/exp_manager.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 1bbe5dd1b0..858250dfdb 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -61,10 +61,13 @@ def __init__( [p.name for runs in self.trial_hparams_to_rundirs.values() for p in runs] ) print("\n") - print("{:31} : {:4} ".format("Trials in experiment", len(self.trials))) - print("{:31} : {:4}".format("Total expected trials", self.total_budgets)) + self.print_status() + + def print_status(self): + print("{:32} : {:4} ".format("Trials in experiment", len(self.trials))) + print("{:32} : {:4}".format("Total expected trials", self.total_budgets)) print( - "{:31} : {:4} ".format( + "{:32} : {:4} ".format( "Trials status", " ".join( [ @@ -75,7 +78,7 @@ def __init__( ) ) print( - "{:31} : {}".format( + "{:32} : {}".format( "Trial 
level(=rung) distribution", " ".join( [ @@ -88,18 +91,18 @@ def __init__( ) ) print( - "{:31} : {:4}".format( + "{:32} : {:4}".format( "Existing unique HP sets executed", len(self.trial_hparams_to_rundirs) ) ) print( - "{:31} : {:4}".format( + "{:32} : {:4}".format( "Total existing trial run dirs", sum(len(v) for v in self.trial_hparams_to_rundirs.values()), ) ) - print("{:31} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) - print("{:31} : {}".format("Algorithm's budgets", str(self.budgets))) + print("{:32} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) + print("{:32} : {}".format("Algorithm's budgets", str(self.budgets))) sq = set( [ j.strip() @@ -113,13 +116,13 @@ def __init__( set([j.parent.name for j in rundir.glob(f"*/{self.name}.exp")]) & sq ) - running print( - "{:31} : {}".format( + "{:32} : {}".format( "Jobs currently running:", f"{len(running)} " + " ".join(running), ) ) print( - "{:31} : {}".format( + "{:32} : {}".format( "Jobs currently waiting:", f"{len(waiting)} " + " ".join(waiting), ) @@ -155,7 +158,7 @@ def get_reserved_wandb_runs(self): return reserved def print_wandb_query(self): - print("WandB runs query:\n" + "(" + "|".join(self.job_ids) + ")") + print(f"{'WandB runs query:':32}\n" + "(" + "|".join(self.job_ids) + ")") @classmethod def help(self): From fc904290d7c8bc5f8686f6ef995677005a359b49 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 10:53:49 -0500 Subject: [PATCH 080/273] update exps --- configs/exps/debug/orion.yaml | 8 +-- configs/exps/icml/is2re-10k/fanet-orion.yaml | 60 ++++++++++++++++++++ configs/exps/qm7x/schnet-from-spooky.yaml | 17 +++++- 3 files changed, 78 insertions(+), 7 deletions(-) create mode 100644 configs/exps/icml/is2re-10k/fanet-orion.yaml diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml index 93eb5f5de4..7c7a528837 100644 --- a/configs/exps/debug/orion.yaml +++ b/configs/exps/debug/orion.yaml @@ -4,7 +4,7 @@ job: cpus: 4 gres: gpu:16gb:1 time: 1:00:00 - partition: main + partition: long code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab env: ocp-a100 @@ -41,10 +41,10 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: ocp-qm9-orion-debug-v1.0.0 + unique_exp_name: ocp-qm9-orion-debug-v1.0.1 space: - optim/max_steps: fidelity(1e5, 1e6, base=3) + optim/max_steps: fidelity(1e3, 1e4, base=3) optim/batch_size: uniform(32, 128, discrete=True) optim/lr_initial: loguniform(1e-5, 5e-3, precision=2) model/num_gaussians: uniform(16, 200, discrete=True) @@ -56,5 +56,5 @@ orion: algorithms: asha: seed: 123 - num_rungs: 5 + num_rungs: 4 num_brackets: 1 diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml new file mode 100644 index 0000000000..cf89e45531 --- /dev/null +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -0,0 +1,60 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 24GB + cpus: 4 + gres: gpu:16gb:1 + time: 2:00:00 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab + env: ocp-a100 + +default: + wandb_project: ocp-4 + config: fanet-is2re-10k + mode: train + test_ri: true + wandb_tags: is2re-10k, orion + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + decay_steps: max_steps + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: -1 + note: + model: name, num_gaussians, hidden_channels, num_filters, 
num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, batch_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: batch_size, num_gaussians, hidden_channels, num_filters, num_interactions, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 20 + + unique_exp_name: fanet-is2re-10k-v1.0.0 + + space: + optim/max_epochs: fidelity(10, 50, base=4) + optim/batch_size: uniform(1, 16, discrete=True) + optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) + model/num_gaussians: uniform(20, 150, discrete=True) + model/hidden_channels: uniform(1, 16, discrete=True) + model/num_filters: uniform(1, 16, discrete=True) + model/num_interactions: uniform(1, 7, discrete=True) + model/phys_embeds: choices([True, False]) + model/batch_norm: choices([True, False]) + model/pg_hidden_channels: uniform(0, 3, discrete=True) + model/phys_hidden_channels: uniform(0, 3, discrete=True) + model/tag_hidden_channels: uniform(0, 3, discrete=True) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) + model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) + model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + algorithms: + asha: + seed: 123 + num_rungs: 5 + num_brackets: 1 diff --git a/configs/exps/qm7x/schnet-from-spooky.yaml b/configs/exps/qm7x/schnet-from-spooky.yaml index 6f2164c4d6..5c8dce6fe9 100644 --- a/configs/exps/qm7x/schnet-from-spooky.yaml +++ b/configs/exps/qm7x/schnet-from-spooky.yaml @@ -26,14 +26,15 @@ default: optim: batch_size, lr_initial optim: batch_size: 10 - warmup_steps: 1000 + warmup_steps: 3000 lr_initial: 0.0001 # parameters EMA # ema_decay: 0.999 - decay_steps: 750000 + decay_steps: max_steps scheduler: decay_rate: 0.01 - max_steps: 1000000 + max_steps: 2000000 + eval_every: 50000 model: hidden_channels: 128 num_filters: 128 @@ -46,6 +47,16 @@ runs: ema_decay: 0.999 - optim: scheduler: LinearWarmupCosineAnnealingLR + model: + cutoff: 6.0 + - optim: + scheduler: LinearWarmupCosineAnnealingLR + model: + num_gaussians: 100 + - optim: + scheduler: LinearWarmupCosineAnnealingLR + model: + num_filters: 256 - optim: ema_decay: 0.999 scheduler: LinearWarmupCosineAnnealingLR From b48b9a14c6e79af83ee50ec3d2a0b7b68ae725f4 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 10:57:04 -0500 Subject: [PATCH 081/273] create orion search yamls *after* confirm --- launch_exp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index e40199df7a..5e0f1b0d04 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -180,9 +180,6 @@ def get_args_or_exp(key, args, exp): exp["unique_exp_name"] = unique_exp_name search_path = orion_base / "search-spaces" / f"{ts}-{unique_exp_name}.yaml" - search_path.parent.mkdir(exist_ok=True, parents=True) - assert not search_path.exists() - search_path.write_text(dump(exp["orion"])) runs = [ { "orion_exp_config_path": str(search_path), @@ -230,6 +227,11 @@ def get_args_or_exp(key, args, exp): if confirm == "y": try: + if "orion" in exp: + search_path.parent.mkdir(exist_ok=True, parents=True) + assert not search_path.exists() + search_path.write_text(dump(exp["orion"])) + outputs = [] for c, command in enumerate(commands): 
print(f"Launching job {c:3}", end="\r") From ac4b3ce0e8cff28acbe48a22fe3c646b9227cad6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:08:02 -0500 Subject: [PATCH 082/273] typo in run --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 8f83773364..5e4f99c427 100644 --- a/main.py +++ b/main.py @@ -79,13 +79,13 @@ def run(self, orion_exp=None): orion_trial = orion_exp.suggest(1) self.hparams = unflatten_dict( apply_mult_factor( - orion_trial.hash_params, + orion_trial.params, self.trainer_config.get("orion_mult_factor"), sep="/", ), sep="/", ) - self.hparams["orion_hash_params"] = orion_trial.params + self.hparams["orion_hash_params"] = orion_trial.hash_params self.hparams["orion_unique_exp_name"] = orion_exp.name should_be_0 = distutils.get_rank() From a0a9c0089c0cf672b015a2d341a32ebd0ef244ab Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:28:56 -0500 Subject: [PATCH 083/273] allow "" for fa_frames --- ocpmodels/datasets/data_transforms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/datasets/data_transforms.py b/ocpmodels/datasets/data_transforms.py index 1e85fff5c0..64556a0038 100644 --- a/ocpmodels/datasets/data_transforms.py +++ b/ocpmodels/datasets/data_transforms.py @@ -44,6 +44,7 @@ def __init__(self, fa_type=None, fa_frames=None): "DA", } assert self.fa_frames in { + "", # equivalent to random, necessary still for sweeps "random", "det", "all", From b70a9ee54b5dc0f1e1e608e8d211791681de7ce1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:29:03 -0500 Subject: [PATCH 084/273] add watch mode --- ocpmodels/common/exp_manager.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 858250dfdb..75d7566975 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -6,6 +6,8 @@ from minydra import resolved_args import os import sys +import time +from datetime import datetime rundir = Path(os.environ["SCRATCH"]) / "ocp" / "runs" @@ -216,6 +218,7 @@ def help(self): Path(__file__).resolve().parent.parent.parent / "data/orion/storage/orion_db.pkl" ), + "watch": -1, } args = resolved_args(defaults=defaults) if args.help: @@ -254,3 +257,30 @@ def help(self): m.print_wandb_query() exp_df = m.exp.to_pandas() reserved_wandbs = m.get_reserved_wandb_runs() + + if args.watch and args.watch > 0: + if args.watch < 15: + print("Cannot watch to often, setting to 15 seconds.") + args.watch = 15 + try: + print("👀 Watching for exp status every every", args.watch, "seconds.") + while True: + time.sleep(args.watch) + print() + print("=" * 30) + print("=" * 30) + print() + print( + "💃 Status of experiment", + f"'{args.name}' and wandb entity/project '{args.wandb_path}' @", + str(datetime.now()).split(".")[0], + ) + print() + m = Manager( + name=args.name, + wandb_path=args.wandb_path, + orion_db_path=args.orion_db_path, + ) + except KeyboardInterrupt: + print("👋 Exiting.") + sys.exit(0) From 721521181fac2fd7374459c8851db2e7955f9a88 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:37:16 -0500 Subject: [PATCH 085/273] add clean dirs command output --- launch_exp.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/launch_exp.py b/launch_exp.py index 5e0f1b0d04..b7495e02f4 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -17,6 +17,12 @@ def util_strings(jobs, yaml_comments=False): s = "All jobs launched: " + ", 
".join(jobs) s += "\nCancel experiment: scancel " + " ".join(jobs) s += "\nWandB query for dashboard: (" + "|".join(jobs) + ")" + s += ( + "\n Delete experiment run dirs: " + + 'ocp_run_dirs="$SCRATCH/ocp/runs; for jid in ' + + " ".join(jobs) + + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done"' + ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) return s @@ -173,6 +179,8 @@ def get_args_or_exp(key, args, exp): if "orion" in exp: orion_base = ROOT / "data" / "orion" assert "runs" not in exp, "Cannot use both Orion and runs" + assert "space" in exp["orion"], "Must specify orion.space" + assert "algorithms" in exp["orion"], "Must specify orion.algorithms" n_jobs = get_args_or_exp("n_jobs", args, exp["orion"]) unique_exp_name = get_args_or_exp("unique_exp_name", args, exp["orion"]) From 9e0325f1aba103590c9c607b6f98884f4c501ddc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:38:00 -0500 Subject: [PATCH 086/273] typo in print --- launch_exp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index b7495e02f4..81547103c7 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -19,9 +19,9 @@ def util_strings(jobs, yaml_comments=False): s += "\nWandB query for dashboard: (" + "|".join(jobs) + ")" s += ( "\n Delete experiment run dirs: " - + 'ocp_run_dirs="$SCRATCH/ocp/runs; for jid in ' + + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) - + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done"' + + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done' ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) From 2fb9ec6c1803558672dd39526e6320c60045abdc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 12:15:36 -0500 Subject: [PATCH 087/273] auto `max_steps` if `max_epochs` --- ocpmodels/trainers/base_trainer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index b5a7b71001..a324f73787 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -250,6 +250,11 @@ def load_datasets(self): f"dataset length ({len(self.datasets[split])}),", f"and batch_size ({batch_size})\n", ) + else: + self.config["optim"]["max_steps"] = int( + self.config["optim"]["max_epochs"] + * (len(self.datasets[split]) / batch_size) + ) self.samplers[split] = self.get_sampler( self.datasets[split], batch_size, shuffle=shuffle From c0a32e933ac253fbe580133029d7cfa2f16e38cc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 14:25:15 -0500 Subject: [PATCH 088/273] batch_norm to graph_norm (& updown GN fix) --- configs/exps/gnn/batch_norm.yaml | 8 +++--- configs/exps/gnn/edge_embed_type.yaml | 2 +- configs/exps/gnn/mp_type_3.yaml | 4 +-- configs/exps/icml/is2re-10k/fanet-orion.yaml | 28 +++++++++++--------- configs/models/fanet.yaml | 2 +- launch_exp.py | 9 ++++--- ocpmodels/models/fanet.py | 26 +++++++++--------- scripts/gnn_dev.py | 2 +- 8 files changed, 43 insertions(+), 38 deletions(-) diff --git a/configs/exps/gnn/batch_norm.yaml b/configs/exps/gnn/batch_norm.yaml index 4655fa3a9b..df99a320eb 100644 --- a/configs/exps/gnn/batch_norm.yaml +++ b/configs/exps/gnn/batch_norm.yaml @@ -29,25 +29,25 @@ runs: fa_frames: se3-random model: mp_type: base - batch_norm: True + graph_norm: True - config: fanet-is2re-all note: 'batch norm after propagate Interaction' frame_averaging: 2D 
fa_frames: se3-random model: mp_type: att - batch_norm: True + graph_norm: True - config: fanet-is2re-all note: 'batch norm after propagate Interaction' frame_averaging: 2D fa_frames: se3-random model: mp_type: local_env - batch_norm: True + graph_norm: True - config: fanet-is2re-all note: 'batch norm after propagate Interaction' frame_averaging: 2D fa_frames: se3-random model: mp_type: sfarinet - batch_norm: True + graph_norm: True diff --git a/configs/exps/gnn/edge_embed_type.yaml b/configs/exps/gnn/edge_embed_type.yaml index 9354db6d8a..34793b8c4e 100644 --- a/configs/exps/gnn/edge_embed_type.yaml +++ b/configs/exps/gnn/edge_embed_type.yaml @@ -59,5 +59,5 @@ runs: mp_type: base skip_co: "concat" complex_mp: true - batch_norm: true + graph_norm: true second_layer_mlp: true \ No newline at end of file diff --git a/configs/exps/gnn/mp_type_3.yaml b/configs/exps/gnn/mp_type_3.yaml index 8ba039780b..ec14d42dcd 100644 --- a/configs/exps/gnn/mp_type_3.yaml +++ b/configs/exps/gnn/mp_type_3.yaml @@ -133,14 +133,14 @@ runs: fa_frames: se3-random model: mp_type: sfarinet - batch_norm: true + graph_norm: true - config: fanet-is2re-all note: 'base_updownscale' frame_averaging: 2D fa_frames: se3-random model: mp_type: base_updownscale - batch_norm: true + graph_norm: true - config: fanet-is2re-all note: 'simple bigger layers' frame_averaging: 2D diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index cf89e45531..f073505c5e 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -3,7 +3,7 @@ job: mem: 24GB cpus: 4 gres: gpu:16gb:1 - time: 2:00:00 + time: 30:00 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab env: ocp-a100 @@ -17,42 +17,44 @@ default: cp_data_to_tmpdir: true graph_rewiring: remove-tag-0 optim: - warmup_steps: 3000 + warmup_steps: 500 # parameters EMA ema_decay: 0.999 decay_steps: max_steps scheduler: LinearWarmupCosineAnnealingLR max_epochs: -1 note: - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, batch_norm + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial _root_: frame_averaging, fa_frames orion_mult_factor: value: 32 - targets: batch_size, num_gaussians, hidden_channels, num_filters, num_interactions, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + targets: batch_size, hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-is2re-10k-v1.0.0 + unique_exp_name: fanet-is2re-10k-v1.1.0 space: - optim/max_epochs: fidelity(10, 50, base=4) - optim/batch_size: uniform(1, 16, discrete=True) + optim/max_epochs: fidelity(20, 100, base=4) + optim/batch_size: uniform(1, 10, discrete=True) optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) - model/num_gaussians: uniform(20, 150, discrete=True) + # model/graph_norm: choices([True, False]) + model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) model/hidden_channels: uniform(1, 16, discrete=True) + model/mp_type: 
choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) model/num_filters: uniform(1, 16, discrete=True) + model/num_gaussians: uniform(20, 150, discrete=True) model/num_interactions: uniform(1, 7, discrete=True) - model/phys_embeds: choices([True, False]) - model/batch_norm: choices([True, False]) model/pg_hidden_channels: uniform(0, 3, discrete=True) + model/phys_embeds: choices([True, False]) model/phys_hidden_channels: uniform(0, 3, discrete=True) model/tag_hidden_channels: uniform(0, 3, discrete=True) - model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) - model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) - model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + frame_averaging: choices(["", "2D", "3D", "DA"]) + fa_frames: choices(["", "random", "det", "all", "se3-all", "se3-random", "se3-det", "multiple", "se3-multiple"]) algorithms: asha: seed: 123 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 7822609aac..0c789467b3 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -21,7 +21,7 @@ default: complex_mp: False edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} - batch_norm: False # bool + graph_norm: False # bool att_heads: 1 # int force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True force_decoder_model_config: diff --git a/launch_exp.py b/launch_exp.py index 81547103c7..e146f816dd 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -15,13 +15,14 @@ def util_strings(jobs, yaml_comments=False): s = "All jobs launched: " + ", ".join(jobs) - s += "\nCancel experiment: scancel " + " ".join(jobs) - s += "\nWandB query for dashboard: (" + "|".join(jobs) + ")" + s += "\nCancel experiment:\n $ scancel " + " ".join(jobs) + s += "\nWandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( - "\n Delete experiment run dirs: " + "\nDelete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) - + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done' + + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid";' + + " done; unset $ocp_run_dirs; unset $jid" ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index f0ac255805..092f1d8d3c 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -239,16 +239,18 @@ def __init__( mp_type, complex_mp, att_heads, - batch_norm, + graph_norm, ): super(InteractionBlock, self).__init__() self.act = act self.mp_type = mp_type self.hidden_channels = hidden_channels self.complex_mp = complex_mp - self.batch_norm = batch_norm - if batch_norm: - self.graph_norm = GraphNorm(hidden_channels) + self.graph_norm = graph_norm + if graph_norm: + self.graph_norm = GraphNorm( + hidden_channels if "updown" not in self.mp_type else num_filters + ) if self.mp_type == "simple": self.lin_geom = nn.Linear(num_filters, hidden_channels) @@ -345,19 +347,19 @@ def forward(self, h, edge_index, e): if self.mp_type == "updownscale" or self.mp_type == "updownscale_base": h = self.act(self.lin_down(h)) # downscale node rep. 
h = self.propagate(edge_index, x=h, W=e) # propagate - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = self.act(self.lin_up(h)) # upscale node rep. elif self.mp_type == "att": h = self.lin_geom(h, edge_index, edge_attr=e) - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) elif self.mp_type == "base_with_att": h = self.lin_geom(h, edge_index, edge_attr=e) # propagate is inside - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) @@ -365,7 +367,7 @@ def forward(self, h, edge_index, e): chi = self.propagate(edge_index, x=h, W=e, local_env=True) h = self.propagate(edge_index, x=h, W=e) # propagate h = h + chi - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = h = self.act(self.lin_h(h)) @@ -374,14 +376,14 @@ def forward(self, h, edge_index, e): chi = self.propagate(edge_index, x=h, W=e, local_env=True) e = self.lin_geom(e) h = self.propagate(edge_index, x=h, W=e) # propagate - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = torch.cat((h, chi), dim=1) h = self.lin_up(h) elif self.mp_type in {"base", "simple", "sfarinet"}: h = self.propagate(edge_index, x=h, W=e) # propagate - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) @@ -502,7 +504,7 @@ class FANet(BaseModel): mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env' 'updownscale_base', 'updownscale', 'updown_local_env', 'sfarinet'}}): specificies the MP of the interaction block. - batch_norm (bool): whether to apply batch norm after every linear layer. + graph_norm (bool): whether to apply batch norm after every linear layer. 
complex_mp (bool); whether to add a second layer MLP at the end of each Interaction """ @@ -558,7 +560,7 @@ def __init__(self, **kwargs): kwargs["mp_type"], kwargs["complex_mp"], kwargs["att_heads"], - kwargs["batch_norm"], + kwargs["graph_norm"], ) for _ in range(kwargs["num_interactions"]) ] diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 0f59d42311..617e54cc18 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -28,7 +28,7 @@ config["model"]["skip_co"] = False config["model"]["att_heads"] = 3 config["model"]["complex_mp"] = True - config["model"]["batch_norm"] = True + config["model"]["graph_norm"] = True # config["model"]["regress_forces"] = "direct_with_gradient_target" checkpoint_path = None From a057d58c8e6b07397f0aa991da4916fda4d98e19 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 14:32:16 -0500 Subject: [PATCH 089/273] add max samples option --- ocpmodels/trainers/base_trainer.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index a324f73787..357c6e1f40 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -222,6 +222,7 @@ def load_datasets(self): transform = get_transforms(self.config) # TODO: train/val/test behavior batch_size = self.config["optim"]["batch_size"] max_steps = self.config["optim"].get("max_steps", -1) + max_samples = self.config["optim"].get("max_samples", -1) for split, ds_conf in self.config["dataset"].items(): if split == "default_val": @@ -234,11 +235,28 @@ def load_datasets(self): shuffle = False if split == "train": shuffle = True - if max_steps > 0: + if max_samples > 0: if self.config["optim"].get("max_epochs", -1) > 0: print( - "WARNING: Both max_steps and max_epochs are set.", - "Using max_steps.", + "\nWARNING: Both max_samples and max_epochs are set.", + "Using max_samples.", + ) + if self.config["optim"].get("max_steps", -1) > 0: + print( + "WARNING: Both max_samples and max_steps are set.", + "Using max_samples.\n", + ) + self.config["optim"]["max_epochs"] = int( + np.ceil(max_samples / len(self.datasets[split])) + ) + self.config["optim"]["max_steps"] = int( + np.ceil(max_samples / batch_size) + ) + elif max_steps > 0: + if self.config["optim"].get("max_epochs", -1) > 0: + print( + "\nWARNING: Both max_steps and max_epochs are set.", + "Using max_steps.\n", ) self.config["optim"]["max_epochs"] = int( np.ceil(max_steps / (len(self.datasets[split]) / batch_size)) From bdc40b79ebf084af84a11fe22f934fec9678b526 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 14:55:40 -0500 Subject: [PATCH 090/273] update exp --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 14 +++++++------- ocpmodels/trainers/single_trainer.py | 7 +++---- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index f073505c5e..da605c5b83 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -14,34 +14,34 @@ default: mode: train test_ri: true wandb_tags: is2re-10k, orion - cp_data_to_tmpdir: true + cp_data_to_tmpdir: false graph_rewiring: remove-tag-0 + log_train_every: 20 optim: - warmup_steps: 500 + warmup_steps: 100 # parameters EMA ema_decay: 0.999 decay_steps: max_steps scheduler: LinearWarmupCosineAnnealingLR - max_epochs: -1 + batch_size: 256 note: model: name, num_gaussians, hidden_channels, num_filters, 
num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial _root_: frame_averaging, fa_frames orion_mult_factor: value: 32 - targets: batch_size, hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-is2re-10k-v1.1.0 + unique_exp_name: fanet-is2re-10k-v1.1.1 space: optim/max_epochs: fidelity(20, 100, base=4) - optim/batch_size: uniform(1, 10, discrete=True) optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) - # model/graph_norm: choices([True, False]) + model/graph_norm: choices([True, False]) model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) model/hidden_channels: uniform(1, 16, discrete=True) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 0c7109bc25..d964ffeba9 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -203,9 +203,6 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): first_eval = True log_train_every = self.config["log_train_every"] - print(f"Logging train metrics every {log_train_every} steps") - print(f"Printing train metrics every {self.config['print_every']} steps") - # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. start_epoch = self.step // n_train @@ -214,7 +211,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): model_run_time = 0 if not self.silent: - print(f"--- 🔄 Beginning of Training @ {self.now}---") + print(f"--- 🔄 Beginning of Training @ {self.now}---\n") + print(f"Logging train metrics every {log_train_every} steps") + print(f"Printing train metrics every {self.config['print_every']} steps") for epoch_int in range(start_epoch, self.config["optim"]["max_epochs"]): From 18cf7337d912089cbd67c3f3b74d5831476c6499 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 15:26:41 -0500 Subject: [PATCH 091/273] fix multiple fa frames --- ocpmodels/preprocessing/frame_averaging.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/preprocessing/frame_averaging.py b/ocpmodels/preprocessing/frame_averaging.py index 198cff6933..0dcf856c25 100644 --- a/ocpmodels/preprocessing/frame_averaging.py +++ b/ocpmodels/preprocessing/frame_averaging.py @@ -84,6 +84,7 @@ def all_frames(eigenvec, pos, cell, fa_frames="random", pos_3D=None, det_index=0 index = random.randint(0, len(all_fa_pos) - 1) return [all_fa_pos[index]], [all_cell[index]], [all_rots[index]] if index.sum() == 1: + _, index = torch.max(index, dim=0) return [all_fa_pos[index]], [all_cell[index]], [all_rots[index]] else: all_fa_pos = [a for a, b in zip(all_fa_pos, index) if b] From dcb3bf9a510a01972db2d07a212c173bb7d79503 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 15:36:37 -0500 Subject: [PATCH 092/273] fix `updownscale_base` --- configs/exps/gnn/mp_type_3.yaml | 4 ++-- ocpmodels/models/fanet.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/exps/gnn/mp_type_3.yaml b/configs/exps/gnn/mp_type_3.yaml index ec14d42dcd..f40f275ea8 100644 --- 
a/configs/exps/gnn/mp_type_3.yaml +++ b/configs/exps/gnn/mp_type_3.yaml @@ -135,11 +135,11 @@ runs: mp_type: sfarinet graph_norm: true - config: fanet-is2re-all - note: 'base_updownscale' + note: 'updownscale_base' frame_averaging: 2D fa_frames: se3-random model: - mp_type: base_updownscale + mp_type: updownscale_base graph_norm: true - config: fanet-is2re-all note: 'simple bigger layers' diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 092f1d8d3c..d1577e9ece 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -318,7 +318,7 @@ def reset_parameters(self): if self.complex_mp: nn.init.xavier_uniform_(self.other_mlp.weight) self.other_mlp.bias.data.fill_(0) - if self.mp_type in {"updownscale", "base_updownscale", "updown_local_env"}: + if self.mp_type in {"updownscale", "updownscale_base", "updown_local_env"}: nn.init.xavier_uniform_(self.lin_up.weight) self.lin_up.bias.data.fill_(0) nn.init.xavier_uniform_(self.lin_down.weight) From dcfe8550f8ac2d6bc5709a261da646437de563e0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 15:36:46 -0500 Subject: [PATCH 093/273] sort printed jobs --- ocpmodels/common/exp_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 75d7566975..c5321075df 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -120,13 +120,13 @@ def print_status(self): print( "{:32} : {}".format( "Jobs currently running:", - f"{len(running)} " + " ".join(running), + f"{len(running)} " + " ".join(sorted(running)), ) ) print( "{:32} : {}".format( "Jobs currently waiting:", - f"{len(waiting)} " + " ".join(waiting), + f"{len(waiting)} " + " ".join(sorted(waiting)), ) ) From 784a2a1e2cbba587635221cc8cfd4f1949db9053 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 18:11:29 -0500 Subject: [PATCH 094/273] update db path --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 14 +++++++------- launch_exp.py | 3 +-- ocpmodels/common/utils.py | 6 ++++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index da605c5b83..8d770b152c 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -1,8 +1,8 @@ # more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij job: - mem: 24GB + mem: 8GB cpus: 4 - gres: gpu:16gb:1 + gres: gpu:1 time: 30:00 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab @@ -36,7 +36,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-is2re-10k-v1.1.1 + unique_exp_name: fanet-is2re-10k-v1.2.0 space: optim/max_epochs: fidelity(20, 100, base=4) @@ -44,15 +44,15 @@ orion: model/graph_norm: choices([True, False]) model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) - model/hidden_channels: uniform(1, 16, discrete=True) + model/hidden_channels: uniform(4, 16, discrete=True) model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) model/num_filters: uniform(1, 16, discrete=True) model/num_gaussians: uniform(20, 150, discrete=True) model/num_interactions: uniform(1, 7, discrete=True) - model/pg_hidden_channels: 
uniform(0, 3, discrete=True) + model/pg_hidden_channels: uniform(0, 2, discrete=True) model/phys_embeds: choices([True, False]) - model/phys_hidden_channels: uniform(0, 3, discrete=True) - model/tag_hidden_channels: uniform(0, 3, discrete=True) + model/phys_hidden_channels: uniform(0, 2, discrete=True) + model/tag_hidden_channels: uniform(0, 2, discrete=True) frame_averaging: choices(["", "2D", "3D", "DA"]) fa_frames: choices(["", "random", "det", "all", "se3-all", "se3-random", "se3-det", "multiple", "se3-multiple"]) algorithms: diff --git a/launch_exp.py b/launch_exp.py index e146f816dd..da3d16c6f9 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -21,8 +21,7 @@ def util_strings(jobs, yaml_comments=False): "\nDelete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) - + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid";' - + " done; unset $ocp_run_dirs; unset $jid" + + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 9203819107..73d2d00925 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -106,7 +106,9 @@ def load_orion_exp(args): ), "Must provide orion_unique_exp_name in the command-line or the config file." print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") - db_path = ROOT / "data" / "orion" / "storage" / "orion_db.pkl" + exp_name = args.orion_unique_exp_name or exp_config["unique_exp_name"] + db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) + db_path = ROOT / "data" / "orion" / "storage" / f"{db_id}_db.pkl" db_path.parent.mkdir(parents=True, exist_ok=True) experiment = build_experiment( storage={ @@ -115,7 +117,7 @@ def load_orion_exp(args): "type": "pickleddb", } }, - name=args.orion_unique_exp_name or exp_config["unique_exp_name"], + name=exp_name, space=exp_config["space"], algorithms=exp_config["algorithms"], ) From 578100ebf30c23e6509e7269032a0338ecc09b5f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 13 Jan 2023 18:40:34 -0500 Subject: [PATCH 095/273] fix attention --- ocpmodels/models/fanet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index d1577e9ece..1d2a2a3bc2 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -122,7 +122,7 @@ def __init__( elif self.edge_embed_type == "sh": self.lin_e1 = Linear(15, num_filters) elif self.edge_embed_type == "all": - self.lin_e1 = Linear(15 + num_gaussians, num_filters) + self.lin_e1 = Linear(18 + num_gaussians, num_filters) else: raise ValueError("edge_embedding_type does not exist") @@ -276,7 +276,7 @@ def __init__( hidden_channels, hidden_channels, heads=att_heads, - concat=True, + concat=False, root_weight=False, edge_dim=num_filters, ) @@ -286,7 +286,7 @@ def __init__( hidden_channels, hidden_channels, heads=att_heads, - concat=True, + concat=False, root_weight=False, edge_dim=num_filters, ) From 4578584edb71fa7e02d7afdb91b3225212f21e31 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 19:41:43 -0500 Subject: [PATCH 096/273] remove bad defaults --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index 8d770b152c..0f9500b973 100644 --- 
a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -5,8 +5,6 @@ job: gres: gpu:1 time: 30:00 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab - env: ocp-a100 default: wandb_project: ocp-4 @@ -23,7 +21,7 @@ default: ema_decay: 0.999 decay_steps: max_steps scheduler: LinearWarmupCosineAnnealingLR - batch_size: 256 + batch_size: 64 note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial @@ -36,7 +34,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-is2re-10k-v1.2.0 + unique_exp_name: fanet-is2re-10k-v1.3.0 space: optim/max_epochs: fidelity(20, 100, base=4) From a498ac8f4b20cbfb91f2ea27cdb734dd485eb875 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 10:58:35 -0500 Subject: [PATCH 097/273] add `no_metrics_denorm` flag --- configs/exps/qm7x/schnet-from-spooky.yaml | 38 +++++++++++++++++------ ocpmodels/common/flags.py | 8 +++++ ocpmodels/trainers/single_trainer.py | 16 +++++++--- 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/configs/exps/qm7x/schnet-from-spooky.yaml b/configs/exps/qm7x/schnet-from-spooky.yaml index 5c8dce6fe9..5ad2fa09e5 100644 --- a/configs/exps/qm7x/schnet-from-spooky.yaml +++ b/configs/exps/qm7x/schnet-from-spooky.yaml @@ -12,7 +12,7 @@ default: wandb_project: ocp-qm mode: train test_ri: true - wandb_tags: qm7x + wandb_tags: qm7x, no_metrics_denorm phys_hidden_channels: 0 phys_embeds: False energy_head: False @@ -20,6 +20,7 @@ default: tag_hidden_channels: 0 frame_averaging: "" cp_data_to_tmpdir: true + no_metrics_denorm: true note: task: name model: name, num_gaussians, hidden_channels, num_filters, num_interactions @@ -42,21 +43,40 @@ default: num_interactions: 6 cutoff: 5.0 -runs: +# runs: +# - optim: +# ema_decay: 0.999 +# - optim: +# scheduler: LinearWarmupCosineAnnealingLR +# model: +# cutoff: 6.0 +# - optim: +# scheduler: LinearWarmupCosineAnnealingLR +# model: +# num_gaussians: 100 +# - optim: +# scheduler: LinearWarmupCosineAnnealingLR +# model: +# num_filters: 256 +# - optim: +# ema_decay: 0.999 +# scheduler: LinearWarmupCosineAnnealingLR + +runs: # all above contributed positively to improve eval/val_ood/energy_mae. + # so we're combining them here. + test with slightly larger batch size. + # And with no_metrics_denorm. - optim: ema_decay: 0.999 - - optim: scheduler: LinearWarmupCosineAnnealingLR model: cutoff: 6.0 - - optim: - scheduler: LinearWarmupCosineAnnealingLR - model: + num_filters: 256 num_gaussians: 100 - optim: + batch_size: 32 + ema_decay: 0.999 scheduler: LinearWarmupCosineAnnealingLR model: + cutoff: 6.0 num_filters: 256 - - optim: - ema_decay: 0.999 - scheduler: LinearWarmupCosineAnnealingLR + num_gaussians: 100 \ No newline at end of file diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 519465386b..1199c344ba 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -263,6 +263,14 @@ def add_core_args(self): + " the search space MUST be the same. If it is not, the job will crash." + " If you change the search space, you must change the experiment name.", ) + self.parser.add_argument( + "--no_metrics_denorm", + type=bool, + default=False, + help="Whether or not to disable prediction denormalization to compute" + + " metrics. 
If True, targets are normalized instead of denormalizing " + + "preds.", + ) flags = Flags() diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index d964ffeba9..eeb1b50356 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -594,12 +594,20 @@ def compute_metrics( self.normalizer.get("normalize_labels") and "grad_target" in self.normalizers ): - preds["forces"] = self.normalizers["grad_target"].denorm( - preds["forces"] - ) + if not self.config.get("no_metrics_denorm"): + preds["forces"] = self.normalizers["grad_target"].denorm( + preds["forces"] + ) + else: + target["forces"] = self.normalizers["grad_target"].norm( + target["forces"] + ) if self.normalizer.get("normalize_labels") and "target" in self.normalizers: - preds["energy"] = self.normalizers["target"].denorm(preds["energy"]) + if not self.config.get("no_metrics_denorm"): + preds["energy"] = self.normalizers["target"].denorm(preds["energy"]) + else: + target["energy"] = self.normalizers["target"].norm(target["energy"]) metrics = evaluator.eval(preds, target, prev_metrics=metrics) From 772896c062bac622c7c0b6328f512f4879f15f9b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 11:01:20 -0500 Subject: [PATCH 098/273] prints --- launch_exp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index da3d16c6f9..5ecf95e743 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -25,6 +25,8 @@ def util_strings(jobs, yaml_comments=False): ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) + else: + s = "\n • ".join(s.splitlines()) return s @@ -231,7 +233,7 @@ def get_args_or_exp(key, args, exp): text += "\n<><><> Experiment config:\n\n-----" + exp_file.read_text() + "-----" text += "\n<><><> Experiment runs:\n\n • " + "\n\n • ".join(commands) + separator - confirm = input("\n🚦 Confirm? [y/n]") + confirm = input("\n🚦 Confirm? [y/n] : ") if confirm == "y": try: @@ -262,11 +264,12 @@ def get_args_or_exp(key, args, exp): text += f"{separator}All jobs launched: {' '.join(jobs)}" with outfile.open("w") as f: f.write(text) - print(f"Output written to {str(outfile)}") + print("\n🎉 Done!") + print(f" • Output written to {str(outfile)}") print(util_strings(jobs)) yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) print( - "Experiment summary YAML in ", + " • Experiment summary YAML in ", f"./{str(yml_out.relative_to(Path.cwd()))}", ) else: From 651dd81b22501833de8f0a4b5c51eeb2fb60b066 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 11:19:21 -0500 Subject: [PATCH 099/273] allow for comments in `runs: ` line --- configs/exps/qm7x/schnet-from-spooky.yaml | 13 +++++++++++-- launch_exp.py | 21 +++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/configs/exps/qm7x/schnet-from-spooky.yaml b/configs/exps/qm7x/schnet-from-spooky.yaml index 5ad2fa09e5..597174bc91 100644 --- a/configs/exps/qm7x/schnet-from-spooky.yaml +++ b/configs/exps/qm7x/schnet-from-spooky.yaml @@ -1,7 +1,7 @@ # trainset has 4068193 samples job: - mem: 32GB - cpus: 8 + mem: 12GB + cpus: 4 gres: gpu:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 @@ -76,6 +76,15 @@ runs: # all above contributed positively to improve eval/val_ood/energy_mae. 
batch_size: 32 ema_decay: 0.999 scheduler: LinearWarmupCosineAnnealingLR + model: + cutoff: 6.0 + num_filters: 256 + num_gaussians: 100 + - optim: + batch_size: 512 + lr_initial: 0.0005 + ema_decay: 0.999 + scheduler: LinearWarmupCosineAnnealingLR model: cutoff: 6.0 num_filters: 256 diff --git a/launch_exp.py b/launch_exp.py index 5ecf95e743..1bb0702615 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -14,11 +14,11 @@ def util_strings(jobs, yaml_comments=False): - s = "All jobs launched: " + ", ".join(jobs) - s += "\nCancel experiment:\n $ scancel " + " ".join(jobs) - s += "\nWandB query for dashboard:\n (" + "|".join(jobs) + ")" + s = " • All jobs launched: " + ", ".join(jobs) + s += "\n • Cancel experiment:\n $ scancel " + " ".join(jobs) + s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( - "\nDelete experiment run dirs:\n $ " + "\n • Delete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' @@ -26,7 +26,7 @@ def util_strings(jobs, yaml_comments=False): if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) else: - s = "\n • ".join(s.splitlines()) + s = "\n │ ".join(s.splitlines()) return s @@ -88,8 +88,9 @@ def write_exp_yaml_and_jobs(exp_file, outfile, jobs): jobs (list[str]): List of jobs, one per run line in the yaml exp_file """ lines = exp_file.read_text().splitlines() - if "runs:" in lines: - run_line = lines.index("runs:") + run_lines = [i for i, l in enumerate(lines) if l.strip().startswith("runs:")] + if run_lines: + run_line = run_lines[0] j = 0 for i, line in enumerate(lines[run_line:]): if line.strip().startswith("- "): @@ -244,7 +245,7 @@ def get_args_or_exp(key, args, exp): outputs = [] for c, command in enumerate(commands): - print(f"Launching job {c:3}", end="\r") + print(f"Launching job {c+1:3}", end="\r") outputs.append(os.popen(command).read().strip()) except KeyboardInterrupt: is_interrupted = True @@ -264,9 +265,9 @@ def get_args_or_exp(key, args, exp): text += f"{separator}All jobs launched: {' '.join(jobs)}" with outfile.open("w") as f: f.write(text) - print("\n🎉 Done!") - print(f" • Output written to {str(outfile)}") + print("\n\n ✅ Done!") print(util_strings(jobs)) + # print(f" • Output written to {str(outfile)}") yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) print( " • Experiment summary YAML in ", From dc8b494cac2bddf0208486b24e77d2de7c1f03c9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 11:37:35 -0500 Subject: [PATCH 100/273] explicit `optimizer: AdamW` in configs --- configs/models/tasks/is2re.yaml | 3 ++- configs/models/tasks/qm7x.yaml | 1 + configs/models/tasks/qm9.yaml | 3 +++ configs/models/tasks/s2ef.yaml | 2 ++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/configs/models/tasks/is2re.yaml b/configs/models/tasks/is2re.yaml index 059ef62c53..cf47f159de 100644 --- a/configs/models/tasks/is2re.yaml +++ b/configs/models/tasks/is2re.yaml @@ -9,7 +9,8 @@ default: metric: mae labels: - relaxed energy - + optim: + optimizer: AdamW normalizer: null model: otf_graph: False diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index af410f6f70..98de512a2d 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -17,6 +17,7 @@ default: - total system energy optim: + optimizer: AdamW force_coefficient: 30 energy_coefficient: 1 energy_grad_coefficient: 10 diff --git a/configs/models/tasks/qm9.yaml 
b/configs/models/tasks/qm9.yaml index 262ec232e7..e53c071188 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -9,6 +9,9 @@ default: use_pbc: False force_decoder_type: null + optim: + optimizer: AdamW + task: dataset: qm9 description: "QM9 U0 internal energy at 0K prediction from structure structure." diff --git a/configs/models/tasks/s2ef.yaml b/configs/models/tasks/s2ef.yaml index 92c08cdcf3..4916788b07 100644 --- a/configs/models/tasks/s2ef.yaml +++ b/configs/models/tasks/s2ef.yaml @@ -13,6 +13,8 @@ default: eval_on_free_atoms: True normalizer: null mode: train + optim: + optimizer: AdamW model: otf_graph: False max_num_neighbors: 40 From 171261127a2c2d82e81f9d67624137ac572f40cd Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:06:47 -0500 Subject: [PATCH 101/273] orion qm9 exp --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9.yaml diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml new file mode 100644 index 0000000000..6b4f5b5877 --- /dev/null +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -0,0 +1,58 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 8GB + cpus: 4 + gres: gpu:1 + time: 30:00 + partition: long + +default: + wandb_project: ocp-4 + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion + log_train_every: 100 + optim: + warmup_steps: 2000 + # parameters EMA + ema_decay: 0.999 + decay_steps: max_steps + scheduler: LinearWarmupCosineAnnealingLR + batch_size: 64 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + frame_averaging: 3D + fa_frames: random + model: + edge_embed_type: all_rij + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 20 + + unique_exp_name: fanet-qm9-v1.0.0 + + space: + optim/max_epochs: fidelity(30, 300, base=6) + optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) + model/graph_norm: choices([True, False]) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) + model/hidden_channels: uniform(5, 16, discrete=True) + model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + model/num_filters: uniform(3, 16, discrete=True) + model/num_gaussians: uniform(20, 150, discrete=True) + model/num_interactions: uniform(1, 7, discrete=True) + model/pg_hidden_channels: uniform(0, 2, discrete=True) + model/phys_embeds: choices([True, False]) + model/tag_hidden_channels: uniform(0, 2, discrete=True) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From fed104f4c1da9a557815ec58b1ebeb01ade24a8f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:20:12 -0500 Subject: [PATCH 102/273] `IS_NARVAL` --- ocpmodels/common/logger.py | 11 +++++++---- ocpmodels/common/utils.py | 10 +++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py 
index dab33affcc..c84734e129 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -14,6 +14,7 @@ import wandb from ocpmodels.common.registry import registry +from ocpmodels.common.utils import IS_NARVAL NTFY_OK = False try: @@ -124,14 +125,15 @@ def __init__(self, trainer_config): sbatch_files = list( Path(self.trainer_config["run_dir"]).glob("sbatch_script*.sh") ) - if len(sbatch_files) == 1: + if len(sbatch_files) == 1 and not IS_NARVAL: wandb.save(str(sbatch_files[0])) self.url = wandb.run.get_url() with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: f.write(self.url) - self.collect_output_files(policy="live") - self.collect_output_files(policy="end") + if not IS_NARVAL: + self.collect_output_files(policy="live") + self.collect_output_files(policy="end") def watch(self, model): wandb.watch(model) @@ -169,7 +171,8 @@ def finish(self, error_or_signal=False): self.add_tags("Preempted") if error_or_signal is True: exit_code = 1 - self.collect_output_files(policy="now") + if not IS_NARVAL: + self.collect_output_files(policy="now") wandb.finish(exit_code=exit_code) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 73d2d00925..f81ba0ade5 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -42,6 +42,10 @@ OCP_TASKS = {"s2ef", "is2re", "is2es"} ROOT = Path(__file__).resolve().parent.parent.parent JOB_ID = os.environ.get("SLURM_JOB_ID") +IS_NARVAL = ( + "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") + or os.environ.get("HOME") == "/home/vsch" +) def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): @@ -303,11 +307,7 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): def override_narval_paths(trainer_config): - is_narval = ( - "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") - or os.environ.get("HOME") == "/home/vsch" - or trainer_config["narval"] - ) + is_narval = IS_NARVAL or trainer_config.get("narval") if not is_narval: return trainer_config path_overrides = yaml.safe_load( From 3960dd8ac9c947a5add2a0cd6ebabde17e83a0b0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:25:50 -0500 Subject: [PATCH 103/273] set default time --- sbatch.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sbatch.py b/sbatch.py index ea97c8a7df..a7a5fa2f45 100644 --- a/sbatch.py +++ b/sbatch.py @@ -8,6 +8,11 @@ import re import yaml +IS_NARVAL = ( + "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") + or os.environ.get("HOME") == "/home/vsch" +) + template = """\ #!/bin/bash {sbatch_params} @@ -212,7 +217,6 @@ def write_orion_config(args, outdir): # has the submission been successful? 
success = False sbatch_py_vars = {} - is_narval = "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") # repository root root = Path(__file__).resolve().parent @@ -292,9 +296,12 @@ def write_orion_config(args, outdir): } if args.time: sbatch_params["time"] = args.time - if is_narval: + if IS_NARVAL: del sbatch_params["partition"] sbatch_params["account"] = "rrg-bengioy-ad_gpu" + if "time" not in sbatch_params: + print("WARNING: no time limit specified, setting to 1 day") + sbatch_params["time"] = "1-00:00:00" if "a100" in args.env: modules += ["cuda/11.2"] From fdcf667b649d8ad0420c4b9057bb2eb55b09e421 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:26:15 -0500 Subject: [PATCH 104/273] increase time --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 6b4f5b5877..0ecb00c06c 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -3,7 +3,7 @@ job: mem: 8GB cpus: 4 gres: gpu:1 - time: 30:00 + time: 02:50:00 partition: long default: From 3f9526ff88e3867115c156a1e79b1c5523c4229c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:27:51 -0500 Subject: [PATCH 105/273] fix seconds handling --- launch_exp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index 1bb0702615..43c1d951ae 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -125,7 +125,10 @@ def find_exp(name): def seconds_to_time_str(seconds): - seconds = int(seconds) + try: + seconds = int(seconds) + except ValueError: + return seconds hours = seconds // 3600 minutes = (seconds % 3600) // 60 seconds = seconds % 60 From e126c67b4aafd4d0ed4c58a0d9935e53363a9c2c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:28:51 -0500 Subject: [PATCH 106/273] more space in prints --- launch_exp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 43c1d951ae..764d8b535c 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -15,10 +15,10 @@ def util_strings(jobs, yaml_comments=False): s = " • All jobs launched: " + ", ".join(jobs) - s += "\n • Cancel experiment:\n $ scancel " + " ".join(jobs) + s += "\n • Cancel experiment:\n $ scancel " + " ".join(jobs) s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( - "\n • Delete experiment run dirs:\n $ " + "\n • Delete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' From aabcaa8189b66dd5ed72f77da598cffb2ec7100e Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Sat, 14 Jan 2023 12:44:05 -0500 Subject: [PATCH 107/273] first config orion IS2RE --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 4 +- .../exps/icml/is2re-all/fanet-orion-1.yaml | 59 +++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 configs/exps/icml/is2re-all/fanet-orion-1.yaml diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index 8d770b152c..bd4c587b12 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -5,8 +5,8 @@ job: gres: gpu:1 time: 30:00 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab - env: ocp-a100 + # code_loc: 
/home/mila/s/schmidtv/ocp-project/ocp-drlab + # env: ocp-a100 default: wandb_project: ocp-4 diff --git a/configs/exps/icml/is2re-all/fanet-orion-1.yaml b/configs/exps/icml/is2re-all/fanet-orion-1.yaml new file mode 100644 index 0000000000..a39db9ec02 --- /dev/null +++ b/configs/exps/icml/is2re-all/fanet-orion-1.yaml @@ -0,0 +1,59 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + time: 10:00:00 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-is2re-all + mode: train + test_ri: true + wandb_tags: is2re-all, orion + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + frame_averaging: 2D + fa_frames: random + optim: + scheduler: LinearWarmupCosineAnnealingLR + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 50 + + unique_exp_name: fanet-is2re-all-v1 + + space: + optim/max_epochs: fidelity(8, 30, base=6) + optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) + model/graph_norm: choices([True, False]) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) + model/hidden_channels: uniform(5, 18, discrete=True) + model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + model/num_filters: uniform(2, 16, discrete=True) + model/num_gaussians: uniform(30, 150, discrete=True) + model/num_interactions: uniform(1, 6, discrete=True) + model/pg_hidden_channels: uniform(0, 2, discrete=True) + model/phys_embeds: choices([True, False]) + model/tag_hidden_channels: uniform(0, 2, discrete=True) + model/complex_mp: choices([True, False]) + model/att_heads: choices([1,2,3,4]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["add", "concat", False]) + model/cutoff: choices([4.0, 6.0, 10.0]) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From 6e0b58ed9b4afd14236a86b9920ce341f8e840b0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:45:28 -0500 Subject: [PATCH 108/273] add qm9 narval paths --- configs/models/tasks/_narval.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/models/tasks/_narval.yaml b/configs/models/tasks/_narval.yaml index 9a465c231f..9e43ec5a95 100644 --- a/configs/models/tasks/_narval.yaml +++ b/configs/models/tasks/_narval.yaml @@ -82,3 +82,12 @@ s2ef: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both train: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/train/ + +qm9: + all: + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + val: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + test: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 From 9ab12147a0c96436e1fdc49be256ca11813e80ed Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Sat, 14 Jan 2023 13:22:52 -0500 Subject: [PATCH 109/273] orion config s2ef --- 
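Quick pointer, since this adds another `orion:` exp config like the is2re-10k,
qm9 and is2re-all ones earlier in the series: the `unique_exp_name`, `space`
and `algorithms` keys are read by `load_orion_exp` in
ocpmodels/common/utils.py and handed to `orion.client.build_experiment`; each
worker then calls `suggest(1)` and trains on the returned trial's params (see
the `run()` method patched in main.py earlier in the series). A minimal sketch
of that flow, with illustrative yaml/DB paths rather than the exact values the
launcher computes:

    from pathlib import Path

    import yaml
    from orion.client import build_experiment

    exp = yaml.safe_load(
        Path("configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml").read_text()
    )
    orion_cfg = exp["orion"]

    # Build (or resume) the experiment, backed by a local pickled DB as in
    # load_orion_exp(). The db path here is illustrative.
    experiment = build_experiment(
        storage={
            "database": {
                "host": "data/orion/storage/example_db.pkl",
                "type": "pickleddb",
            }
        },
        name=orion_cfg["unique_exp_name"],
        space=orion_cfg["space"],
        algorithms=orion_cfg["algorithms"],
    )

    # One trial = one set of hyper-parameters for a single training run.
    trial = experiment.suggest(1)
    print(trial.params)
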
.../exps/icml/is2re-all/fanet-orion-1.yaml | 4 +- .../exps/icml/s2ef/fanet-orion-s2ef-1.yaml | 64 +++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml diff --git a/configs/exps/icml/is2re-all/fanet-orion-1.yaml b/configs/exps/icml/is2re-all/fanet-orion-1.yaml index a39db9ec02..48068985a7 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-1.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-1.yaml @@ -14,7 +14,7 @@ default: wandb_tags: is2re-all, orion cp_data_to_tmpdir: true graph_rewiring: remove-tag-0 - model: + model: edge_embed_type: all_rij frame_averaging: 2D fa_frames: random @@ -30,7 +30,7 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 50 + n_jobs: 166 unique_exp_name: fanet-is2re-all-v1 diff --git a/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml b/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml new file mode 100644 index 0000000000..58c5c64538 --- /dev/null +++ b/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml @@ -0,0 +1,64 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-s2ef-2M + mode: train + test_ri: true + wandb_tags: s2ef-2M, orion + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + frame_averaging: 2D + fa_frames: random + optim: + scheduler: LinearWarmupCosineAnnealingLR + force_coefficient: 100 + energy_coefficient: 1 + energy_grad_coefficient: 5 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 50 + + unique_exp_name: fanet-s2ef-2M-v1 + + space: + model/att_heads: choices([1,2,3,4]) + model/complex_mp: choices([True, False]) + model/cutoff: choices([4.0, 6.0, 10.0]) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) + model/graph_norm: choices([True, False]) + model/hidden_channels: uniform(6, 22, discrete=True) + model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + model/num_filters: uniform(2, 18, discrete=True) + model/num_gaussians: uniform(30, 150, discrete=True) + model/num_interactions: uniform(3, 6, discrete=True) + model/pg_hidden_channels: uniform(0, 1, discrete=True) + model/phys_embeds: choices([True, False]) + model/regress_forces: choices(["direct_with_gradient_target", "direct"]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["add", "concat", False]) + model/tag_hidden_channels: uniform(0, 2, discrete=True) + model/max_num_neighbors: choices([30,40,50]) + optim/lr_initial: loguniform(5e-5, 5e-4, precision=2) + optim/max_epochs: fidelity(6, 22, base=6) + + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From f0c0c3cae30e8189b01e540adce30d30022d0ca2 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:30:17 
-0500 Subject: [PATCH 110/273] small regress_forces fix --- launch_exp.py | 2 +- ocpmodels/common/utils.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index 764d8b535c..867579ad81 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -16,7 +16,7 @@ def util_strings(jobs, yaml_comments=False): s = " • All jobs launched: " + ", ".join(jobs) s += "\n • Cancel experiment:\n $ scancel " + " ".join(jobs) - s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" + s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( "\n • Delete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index f81ba0ade5..70dd0812c0 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -974,6 +974,8 @@ def build_config(args, args_override): config["job_id"] = JOB_ID or "no-job-id" if "regress_forces" in config["model"]: + if config["model"]["regress_forces"] == "": + config["model"]["regress_forces"] = False if not isinstance(config["model"]["regress_forces"], str): if config["model"]["regress_forces"] is False: config["model"]["regress_forces"] = "" From e1304e52dea63b0e52f96753209647555a6c9b47 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:33:44 -0500 Subject: [PATCH 111/273] update fanet config with `use_pbc: False` for qm7x/9 --- configs/models/fanet.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 0c789467b3..fe94635ca1 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -164,6 +164,7 @@ qm7x: model: hidden_channels: 384 num_interactions: 4 + use_pbc: False optim: lr_initial: 0.001 @@ -176,3 +177,10 @@ qm7x: all: {} 1k: {} + +qm9: + default: + model: + use_pbc: False + all: {} + 10k: {} From e7318fddd251676683ce0386a37a66420b510442 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:38:19 -0500 Subject: [PATCH 112/273] update `parse_value` --- ocpmodels/common/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 70dd0812c0..e6dfe3572d 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -851,6 +851,10 @@ def parse_value(value): Parse string as Python literal if possible and fallback to string. 
""" try: + if value.lower() == "true": + return True + elif value.lower() == "false": + return False return ast.literal_eval(value) except (ValueError, SyntaxError): # Use as string if nothing else worked From 6ec231a814dd249ede1e380950ed09cfd4020b7c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:50:24 -0500 Subject: [PATCH 113/273] fix `tag_hidden_channels: 0` in qm9 exp --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 0ecb00c06c..74031b7ff5 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -21,12 +21,12 @@ default: scheduler: LinearWarmupCosineAnnealingLR batch_size: 64 note: - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial _root_: frame_averaging, fa_frames orion_mult_factor: value: 32 - targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels frame_averaging: 3D fa_frames: random model: @@ -50,7 +50,6 @@ orion: model/num_interactions: uniform(1, 7, discrete=True) model/pg_hidden_channels: uniform(0, 2, discrete=True) model/phys_embeds: choices([True, False]) - model/tag_hidden_channels: uniform(0, 2, discrete=True) algorithms: asha: seed: 123 From 2e5323004c337924b436f799daffc4ca94679e5a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:52:25 -0500 Subject: [PATCH 114/273] update name --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 74031b7ff5..a33b8bb704 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -36,7 +36,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v1.0.0 + unique_exp_name: fanet-qm9-v1.0.1 space: optim/max_epochs: fidelity(30, 300, base=6) From a3d36926614899b94fb6817274b7265307137835 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 14:15:36 -0500 Subject: [PATCH 115/273] refactor to DRAC, not narval only --- configs/models/tasks/_drac.yaml | 97 +++++++++++++++++++++++++++++++ configs/models/tasks/_narval.yaml | 93 ----------------------------- launch_exp.py | 2 +- ocpmodels/common/flags.py | 6 -- ocpmodels/common/logger.py | 8 +-- ocpmodels/common/utils.py | 49 +++++++++++----- sbatch.py | 5 +- 7 files changed, 141 insertions(+), 119 deletions(-) create mode 100644 configs/models/tasks/_drac.yaml delete mode 100644 configs/models/tasks/_narval.yaml diff --git a/configs/models/tasks/_drac.yaml b/configs/models/tasks/_drac.yaml new file mode 100644 index 0000000000..f623e4d860 --- /dev/null +++ b/configs/models/tasks/_drac.yaml @@ -0,0 +1,97 @@ +# this file overrides paths for data on drac clusters +drac_base_path: + narval: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data + beluga: 
/home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data + +is2re: + 10k: + val_id: + src: _base_/oc20/is2re/all/val_id/data.lmdb + val_ood_cat: + src: _base_/oc20/is2re/all/val_ood_cat/data.lmdb + val_ood_ads: + src: _base_/oc20/is2re/all/val_ood_ads/data.lmdb + val_ood_both: + src: _base_/oc20/is2re/all/val_ood_both/data.lmdb + train: + src: _base_/oc20/is2re/10k/train/data.lmdb + 100k: + val_id: + src: _base_/oc20/is2re/all/val_id/data.lmdb + val_ood_cat: + src: _base_/oc20/is2re/all/val_ood_cat/data.lmdb + val_ood_ads: + src: _base_/oc20/is2re/all/val_ood_ads/data.lmdb + val_ood_both: + src: _base_/oc20/is2re/all/val_ood_both/data.lmdb + train: + src: _base_/oc20/is2re/100k/train/data.lmdb + + all: + val_id: + src: _base_/oc20/is2re/all/val_id/data.lmdb + val_ood_cat: + src: _base_/oc20/is2re/all/val_ood_cat/data.lmdb + val_ood_ads: + src: _base_/oc20/is2re/all/val_ood_ads/data.lmdb + val_ood_both: + src: _base_/oc20/is2re/all/val_ood_both/data.lmdb + train: + src: _base_/oc20/is2re/all/train/data.lmdb +s2ef: + 200k: + val_id: + src: _base_/oc20/s2ef/all/val_id + val_ood_cat: + src: _base_/oc20/s2ef/all/val_ood_cat + val_ood_ads: + src: _base_/oc20/s2ef/all/val_ood_ads + val_ood_both: + src: _base_/oc20/s2ef/all/val_ood_both + train: + src: _base_/oc20/s2ef/200k/train + + 2M: + val_id: + src: _base_/oc20/s2ef/all/val_id + val_ood_cat: + src: _base_/oc20/s2ef/all/val_ood_cat + val_ood_ads: + src: _base_/oc20/s2ef/all/val_ood_ads + val_ood_both: + src: _base_/oc20/s2ef/all/val_ood_both + train: + src: _base_/oc20/s2ef/2M/train/ + + 20M: + val_id: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id + val_ood_cat: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat + val_ood_ads: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads + val_ood_both: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/20M/train/ + + all: + val_id: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id + val_ood_cat: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat + val_ood_ads: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads + val_ood_both: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/train/ + +qm9: + all: + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + val: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + test: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 diff --git a/configs/models/tasks/_narval.yaml b/configs/models/tasks/_narval.yaml deleted file mode 100644 index 9e43ec5a95..0000000000 --- a/configs/models/tasks/_narval.yaml +++ /dev/null @@ -1,93 +0,0 @@ -# this file overrides paths for data on Narval -is2re: - 10k: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_id/data.lmdb - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_cat/data.lmdb - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_ads/data.lmdb - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_both/data.lmdb - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/10k/train/data.lmdb - 100k: - val_id: - src: 
/home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_id/data.lmdb - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_cat/data.lmdb - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_ads/data.lmdb - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_both/data.lmdb - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/100k/train/data.lmdb - - all: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_id/data.lmdb - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_cat/data.lmdb - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_ads/data.lmdb - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_both/data.lmdb - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/train/data.lmdb -s2ef: - 200k: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/200k/train - - 2M: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/2M/train/ - - 20M: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/20M/train/ - - all: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/train/ - -qm9: - all: - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 - val: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 - test: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 diff --git a/launch_exp.py b/launch_exp.py index 867579ad81..0d90f7b95b 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -19,7 +19,7 @@ def util_strings(jobs, yaml_comments=False): s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( "\n • Delete experiment run dirs:\n $ " - + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + + 'exp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' 
+ " ".join(jobs) + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' ) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 1199c344ba..77cb140cc3 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -231,12 +231,6 @@ def add_core_args(self): type=bool, help="Evaluate on test set", ) - self.parser.add_argument( - "--narval", - action="store_true", - default=False, - help="is on Narval DRAC cluster", - ) self.parser.add_argument( "--cp_data_to_tmpdir", type=bool, diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index c84734e129..0a3cddfb5b 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -14,7 +14,7 @@ import wandb from ocpmodels.common.registry import registry -from ocpmodels.common.utils import IS_NARVAL +from ocpmodels.common.utils import CLUSTER NTFY_OK = False try: @@ -125,13 +125,13 @@ def __init__(self, trainer_config): sbatch_files = list( Path(self.trainer_config["run_dir"]).glob("sbatch_script*.sh") ) - if len(sbatch_files) == 1 and not IS_NARVAL: + if len(sbatch_files) == 1 and not CLUSTER.drac: wandb.save(str(sbatch_files[0])) self.url = wandb.run.get_url() with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: f.write(self.url) - if not IS_NARVAL: + if not CLUSTER.drac: self.collect_output_files(policy="live") self.collect_output_files(policy="end") @@ -171,7 +171,7 @@ def finish(self, error_or_signal=False): self.add_tags("Preempted") if error_or_signal is True: exit_code = 1 - if not IS_NARVAL: + if not CLUSTER.drac: self.collect_output_files(policy="now") wandb.finish(exit_code=exit_code) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index e6dfe3572d..a84dc73fee 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -39,13 +39,28 @@ from ocpmodels.common.flags import flags from ocpmodels.common.registry import registry + +class Cluster: + def __init__(self): + self._is = { + "narval": "narval.calcul.quebec" in os.environ.get("HOSTNAME", ""), + "beluga": "beluga.calcul.quebec" in os.environ.get("HOSTNAME", ""), + "mila": "/home/mila/" in os.environ.get("HOME", ""), + } + self.name = [k for k, v in self._is.items() if v][0].capitalize() + self.Name = self.name.capitalize() + self._id["drac"] = self._is["narval"] or self._is["beluga"] + + def __getattribute__(self, k: str): + if k in self._is: + return self._is[k] + raise AttributeError("Unknown attribute " + k) + + +CLUSTER = Cluster() OCP_TASKS = {"s2ef", "is2re", "is2es"} ROOT = Path(__file__).resolve().parent.parent.parent JOB_ID = os.environ.get("SLURM_JOB_ID") -IS_NARVAL = ( - "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") - or os.environ.get("HOME") == "/home/vsch" -) def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): @@ -306,23 +321,32 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): return trainer_config -def override_narval_paths(trainer_config): - is_narval = IS_NARVAL or trainer_config.get("narval") - if not is_narval: +def override_drac_paths(trainer_config): + if not CLUSTER.drac: return trainer_config + path_overrides = yaml.safe_load( - (ROOT / "configs" / "models" / "tasks" / "_narval.yaml").read_text() + (ROOT / "configs" / "models" / "tasks" / "_drac.yaml").read_text() ) + base_path = path_overrides["drac_base_path"][CLUSTER.name] task = trainer_config["task"]["name"] split = trainer_config["task"]["split"] - assert task in path_overrides, f"Task {task} not found in Narval paths overrides" + assert ( + 
task in path_overrides + ), f"Task {task} not found in {CLUSTER.Name} paths overrides" assert ( split in path_overrides[task] - ), f"Split {split} not found in Narval paths overrides for task {task}" + ), f"Split {split} not found in {CLUSTER.Name} paths overrides for task {task}" + + for t, task in copy.deepcopy(path_overrides).items(): + for sub, subset in task.items(): + for spl, split in subset.items(): + src = split["src"].replace("_base_", base_path).replace("//", "/") + path_overrides[t][sub][spl]["src"] = src print( - "Is on Narval. Overriding", + f"Is on {CLUSTER.Name}. Overriding", trainer_config["dataset"], "with", path_overrides[task][split], @@ -1002,7 +1026,7 @@ def build_config(args, args_override): config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) - config = override_narval_paths(config) + config = override_drac_paths(config) if not config["no_cpus_to_workers"]: cpus = count_cpus() @@ -1547,7 +1571,6 @@ def base_config(config, overrides={}): n, [ "run_dir=.", - "narval=", "no_qm7x_cp=true", "no_cpus_to_workers=true", "silent=", diff --git a/sbatch.py b/sbatch.py index a7a5fa2f45..2a6cfd57e1 100644 --- a/sbatch.py +++ b/sbatch.py @@ -8,8 +8,9 @@ import re import yaml -IS_NARVAL = ( +IS_DRAC = ( "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") + or "beluga.calcul.quebec" in os.environ.get("HOSTNAME", "") or os.environ.get("HOME") == "/home/vsch" ) @@ -296,7 +297,7 @@ def write_orion_config(args, outdir): } if args.time: sbatch_params["time"] = args.time - if IS_NARVAL: + if IS_DRAC: del sbatch_params["partition"] sbatch_params["account"] = "rrg-bengioy-ad_gpu" if "time" not in sbatch_params: From c522a5a84087aaf841f2b17f042669e28593bc45 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 14:17:57 -0500 Subject: [PATCH 116/273] log cluster name --- ocpmodels/common/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index a84dc73fee..07d892973f 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -47,9 +47,13 @@ def __init__(self): "beluga": "beluga.calcul.quebec" in os.environ.get("HOSTNAME", ""), "mila": "/home/mila/" in os.environ.get("HOME", ""), } - self.name = [k for k, v in self._is.items() if v][0].capitalize() + self.name = [k for k, v in self._is.items() if v] + if not self.name: + self.name = "unknown" + else: + self.name = self.name[0] self.Name = self.name.capitalize() - self._id["drac"] = self._is["narval"] or self._is["beluga"] + self._is["drac"] = self._is["narval"] or self._is["beluga"] def __getattribute__(self, k: str): if k in self._is: @@ -1000,6 +1004,7 @@ def build_config(args, args_override): config["run_dir"] = resolve(config["run_dir"]) config["slurm"] = {} config["job_id"] = JOB_ID or "no-job-id" + config["cluster_name"] = CLUSTER.name if "regress_forces" in config["model"]: if config["model"]["regress_forces"] == "": From c11b4211c1bc75bb3847a88ac4a00d277fd9e276 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 15:53:22 -0500 Subject: [PATCH 117/273] error in getattribute --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 07d892973f..aa21519d31 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -55,7 +55,7 @@ def __init__(self): self.Name = self.name.capitalize() self._is["drac"] = self._is["narval"] or self._is["beluga"] - def 
__getattribute__(self, k: str): + def __getattr__(self, k: str): if k in self._is: return self._is[k] raise AttributeError("Unknown attribute " + k) From 3ef4445bfdcfdda70a88088e129cfacaa5503c69 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 15:54:02 -0500 Subject: [PATCH 118/273] typo --- launch_exp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index 0d90f7b95b..e2c55e6eab 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -21,7 +21,7 @@ def util_strings(jobs, yaml_comments=False): "\n • Delete experiment run dirs:\n $ " + 'exp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) - + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' + + '; do rm -rf "$exp_run_dirs/$jid" && echo "Deleted $exp_run_dirs/$jid"; done;' ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) From 39d6c3690cd9bc40951bc8f6b9a8c7fac444aeae Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 16:51:48 -0500 Subject: [PATCH 119/273] use `CC_CLUSTER` env var --- ocpmodels/common/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index aa21519d31..bfa9d208c9 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -43,8 +43,8 @@ class Cluster: def __init__(self): self._is = { - "narval": "narval.calcul.quebec" in os.environ.get("HOSTNAME", ""), - "beluga": "beluga.calcul.quebec" in os.environ.get("HOSTNAME", ""), + "narval": os.environ.get("CC_CLUSTER") == "narval", + "beluga": os.environ.get("CC_CLUSTER") == "beluga", "mila": "/home/mila/" in os.environ.get("HOME", ""), } self.name = [k for k, v in self._is.items() if v] From fcbc47b874610579f8ae25c6d346547e08e10862 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 16:58:30 -0500 Subject: [PATCH 120/273] `pop` `drac_base_path` --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index bfa9d208c9..5ba27a16cf 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -332,7 +332,7 @@ def override_drac_paths(trainer_config): path_overrides = yaml.safe_load( (ROOT / "configs" / "models" / "tasks" / "_drac.yaml").read_text() ) - base_path = path_overrides["drac_base_path"][CLUSTER.name] + base_path = path_overrides.pop("drac_base_path")[CLUSTER.name] task = trainer_config["task"]["name"] split = trainer_config["task"]["split"] assert ( From 5c62aa467fbae8b0e6de11a9fd3e51118824ca0b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 17:35:26 -0500 Subject: [PATCH 121/273] fix loop var leak --- launch_exp.py | 1 + ocpmodels/common/utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index e2c55e6eab..8ab6d86867 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -247,6 +247,7 @@ def get_args_or_exp(key, args, exp): search_path.write_text(dump(exp["orion"])) outputs = [] + print() for c, command in enumerate(commands): print(f"Launching job {c+1:3}", end="\r") outputs.append(os.popen(command).read().strip()) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 5ba27a16cf..241343d984 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -343,10 +343,10 @@ def override_drac_paths(trainer_config): split in path_overrides[task] ), f"Split {split} not found in {CLUSTER.Name} paths overrides for 
task {task}" - for t, task in copy.deepcopy(path_overrides).items(): - for sub, subset in task.items(): - for spl, split in subset.items(): - src = split["src"].replace("_base_", base_path).replace("//", "/") + for t, task_dict in copy.deepcopy(path_overrides).items(): + for sub, subset_dict in task_dict.items(): + for spl, split_dict in subset_dict.items(): + src = split_dict["src"].replace("_base_", base_path).replace("//", "/") path_overrides[t][sub][spl]["src"] = src print( From bb5f8779312d64ed476036ff8ede7a7a4b24e25d Mon Sep 17 00:00:00 2001 From: alexhernandezgarcia Date: Sat, 14 Jan 2023 18:06:32 -0500 Subject: [PATCH 122/273] add break line in wandb_url.txt --- configs/sbatch/alex.hernandez-garcia.yaml | 3 ++- ocpmodels/common/logger.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/sbatch/alex.hernandez-garcia.yaml b/configs/sbatch/alex.hernandez-garcia.yaml index 7afa1f9a41..dfa273625e 100644 --- a/configs/sbatch/alex.hernandez-garcia.yaml +++ b/configs/sbatch/alex.hernandez-garcia.yaml @@ -1,4 +1,5 @@ # Overwrites defaults.yaml for user `schmidtv`. # Create your own $USER.yaml in order to overwrite defaults.yaml systematically to your own taste. virtualenv: True -env: /home/mila/a/alex.hernandez-garcia/.virtualenvs/ocp-torch1110cuda102 +env: /home/mila/a/alex.hernandez-garcia/.virtualenvs/ocp-torch1121cuda112 +modules: cuda/11.2, python/3.8 diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 0a3cddfb5b..7d19ba2106 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -130,7 +130,7 @@ def __init__(self, trainer_config): self.url = wandb.run.get_url() with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: - f.write(self.url) + f.write(self.url + "\n") if not CLUSTER.drac: self.collect_output_files(policy="live") self.collect_output_files(policy="end") From 41d71517c68fe15c04290c3c6fb8f1309df76e4c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 20:12:39 -0500 Subject: [PATCH 123/273] async cancel --- main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 5e4f99c427..1620005e9f 100644 --- a/main.py +++ b/main.py @@ -200,9 +200,9 @@ def run(self, orion_exp=None): distutils.cleanup() print("Done!") + if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): + print("\nSelf-canceling SLURM job in 32s", JOB_ID) + os.popen(f"sleep 32 && scancel {JOB_ID}") + if runner and runner.trainer and runner.trainer.logger: runner.trainer.logger.finish(error or signal) - - if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): - print("\nSelf-canceling SLURM job", JOB_ID) - os.system(f"scancel {JOB_ID}") From 63992210d89539400c1834820ee165f9b9907775 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 20:16:58 -0500 Subject: [PATCH 124/273] update proj --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index a33b8bb704..aea67fb2d8 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -7,7 +7,7 @@ job: partition: long default: - wandb_project: ocp-4 + wandb_project: ocp-qm config: fanet-qm9-all mode: train test_ri: true From e82bc876493d520a2acfe591346374ab50786a96 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Sun, 15 Jan 2023 12:26:45 -0500 Subject: [PATCH 125/273] config orion 2 --- 
.../exps/icml/is2re-all/fanet-orion-2.yaml | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 configs/exps/icml/is2re-all/fanet-orion-2.yaml diff --git a/configs/exps/icml/is2re-all/fanet-orion-2.yaml b/configs/exps/icml/is2re-all/fanet-orion-2.yaml new file mode 100644 index 0000000000..a9f9a03370 --- /dev/null +++ b/configs/exps/icml/is2re-all/fanet-orion-2.yaml @@ -0,0 +1,60 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + time: 10:00:00 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-is2re-all + mode: train + test_ri: true + wandb_tags: is2re-all, orion-2 + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + graph_norm: True + weighted-av-final-embeds: True + frame_averaging: 2D + fa_frames: random + max_epochs_fidelity: 30 + optim: + scheduler: LinearWarmupCosineAnnealingLR + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 216 + + unique_exp_name: fanet-is2re-all-v1 + + space: + optim/max_epochs: fidelity(15, 30, base=6) + optim/lr_initial: loguniform(6e-4, 4e-3, precision=2) + model/hidden_channels: uniform(8, 19, discrete=True) + model/mp_type: choices(["base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "updown_local_env"]) + model/num_filters: uniform(3, 18, discrete=True) + model/num_gaussians: uniform(50, 170, discrete=True) + model/num_interactions: uniform(3, 7, discrete=True) + model/pg_hidden_channels: uniform(0, 2, discrete=True) + model/phys_embeds: choices([True, False]) + model/tag_hidden_channels: uniform(0, 3, discrete=True) + model/complex_mp: choices([True, False]) + model/att_heads: choices([1,3,6]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["add", "concat", False]) + model/cutoff: choices([4.0, 6.0, 10.0]) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From 29090f25cf8fb534fe58859e6c79e8b4d6643925 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 12:26:55 -0500 Subject: [PATCH 126/273] set `fidelity_max_epochs` auto and use that for steps --- main.py | 14 +++++--- ocpmodels/common/utils.py | 11 ++++-- ocpmodels/trainers/base_trainer.py | 56 ++++++++++++++++++++++++------ 3 files changed, 63 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 1620005e9f..4d0a29dba8 100644 --- a/main.py +++ b/main.py @@ -34,6 +34,7 @@ setup_logging, unflatten_dict, update_from_sbatch_py_vars, + set_max_fidelity, ) from ocpmodels.trainers import BaseTrainer @@ -77,13 +78,16 @@ def run(self, orion_exp=None): if distutils.is_master(): if orion_exp: orion_trial = orion_exp.suggest(1) - self.hparams = unflatten_dict( - apply_mult_factor( - orion_trial.params, - self.trainer_config.get("orion_mult_factor"), + self.hparams = set_max_fidelity( + unflatten_dict( + apply_mult_factor( + orion_trial.params, + self.trainer_config.get("orion_mult_factor"), + sep="/", + ), sep="/", ), - sep="/", + 
orion_exp, ) self.hparams["orion_hash_params"] = orion_trial.hash_params self.hparams["orion_unique_exp_name"] = orion_exp.name diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 241343d984..d13d8a484f 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -67,6 +67,13 @@ def __getattr__(self, k: str): JOB_ID = os.environ.get("SLURM_JOB_ID") +def set_max_fidelity(hparams, orion_exp): + for p, prior in orion_exp.space.items(): + if prior.type == "fidelity": + hparams[f"fidelity_{p}"] = prior.high + return hparams + + def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): """ Multiplies all values of orion_hparams listed in mult_factor_dict["targets"] @@ -169,9 +176,6 @@ def continue_orion_exp(trainer_config): base_dir = Path(trainer_config["run_dir"]).parent existing_id_files = list(base_dir.glob(f"*/{id_file}")) - if not existing_id_files: - return trainer_config - latest_dirs = sorted( [ f.parent @@ -182,6 +186,7 @@ def continue_orion_exp(trainer_config): ) if not latest_dirs: + print("\n😅 No previous Orion trial matched for unique file: ", id_file) return trainer_config resume_dir = latest_dirs[-1] diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 357c6e1f40..ebe3b6a8bb 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -221,8 +221,29 @@ def load_datasets(self): transform = get_transforms(self.config) # TODO: train/val/test behavior batch_size = self.config["optim"]["batch_size"] - max_steps = self.config["optim"].get("max_steps", -1) - max_samples = self.config["optim"].get("max_samples", -1) + epochs_key = ( + "max_epochs" + if "fidelity_max_epochs" not in self.config["optim"] + else "fidelity_max_epochs" + ) + steps_key = ( + "max_steps" + if "fidelity_max_steps" not in self.config["optim"] + else "fidelity_max_steps" + ) + samples_key = ( + "max_samples" + if "fidelity_max_samples" not in self.config["optim"] + else "fidelity_max_samples" + ) + max_epochs = self.config["optim"].get(epochs_key, -1) + max_steps = self.config["optim"].get(steps_key, -1) + max_samples = self.config["optim"].get(samples_key, -1) + print("Optim config auto update:") + for k, v in zip( + [epochs_key, steps_key, samples_key], [max_epochs, max_steps, max_samples] + ): + print(f" • {k}: {v}") for split, ds_conf in self.config["dataset"].items(): if split == "default_val": @@ -235,43 +256,49 @@ def load_datasets(self): shuffle = False if split == "train": shuffle = True + n_train = len(self.datasets[split]) if max_samples > 0: - if self.config["optim"].get("max_epochs", -1) > 0: + if max_epochs > 0: print( "\nWARNING: Both max_samples and max_epochs are set.", "Using max_samples.", ) - if self.config["optim"].get("max_steps", -1) > 0: + if max_steps > 0: print( "WARNING: Both max_samples and max_steps are set.", "Using max_samples.\n", ) self.config["optim"]["max_epochs"] = int( - np.ceil(max_samples / len(self.datasets[split])) + np.ceil(max_samples / n_train) ) self.config["optim"]["max_steps"] = int( np.ceil(max_samples / batch_size) ) elif max_steps > 0: - if self.config["optim"].get("max_epochs", -1) > 0: + if max_epochs > 0: print( "\nWARNING: Both max_steps and max_epochs are set.", "Using max_steps.\n", ) self.config["optim"]["max_epochs"] = int( - np.ceil(max_steps / (len(self.datasets[split]) / batch_size)) + np.ceil(max_steps / (n_train / batch_size)) ) print( "Setting max_epochs to", self.config["optim"]["max_epochs"], f"from max_steps ({max_steps}),", - f"dataset 
length ({len(self.datasets[split])}),", + f"dataset length ({n_train}),", f"and batch_size ({batch_size})\n", ) else: self.config["optim"]["max_steps"] = int( - self.config["optim"]["max_epochs"] - * (len(self.datasets[split]) / batch_size) + np.ceil(max_epochs * (n_train / batch_size)) + ) + print( + "Setting max_steps to ", + f"{self.config['optim']['max_steps']} from", + f"max_epochs ({max_epochs}), dataset length", + f"({n_train}), and batch_size ({batch_size})\n", ) self.samplers[split] = self.get_sampler( @@ -383,6 +410,12 @@ def load_checkpoint(self, checkpoint_path): self.optimizer.load_state_dict(checkpoint["optimizer"]) if "scheduler" in checkpoint and checkpoint["scheduler"] is not None: self.scheduler.scheduler.load_state_dict(checkpoint["scheduler"]) + if checkpoint.get("warmup_scheduler") is not None and hasattr( + self.scheduler, "warmup_scheduler" + ): + self.scheduler.warmup_scheduler.load_state_dict( + checkpoint["warmup_scheduler"] + ) if "ema" in checkpoint and checkpoint["ema"] is not None: self.ema.load_state_dict(checkpoint["ema"]) else: @@ -484,6 +517,9 @@ def save( "scheduler": self.scheduler.scheduler.state_dict() if self.scheduler.scheduler_type != "Null" else None, + "warmup_scheduler": self.scheduler.warmup_scheduler.state_dict() + if hasattr(self.scheduler, "warmup_scheduler") + else None, "normalizers": { key: value.state_dict() for key, value in self.normalizers.items() From a5ce1278cc87af7b038256e5909623d7c323f58f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 12:27:25 -0500 Subject: [PATCH 127/273] update drac paths --- configs/models/tasks/_drac.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/models/tasks/_drac.yaml b/configs/models/tasks/_drac.yaml index f623e4d860..bbfa6a9847 100644 --- a/configs/models/tasks/_drac.yaml +++ b/configs/models/tasks/_drac.yaml @@ -1,7 +1,7 @@ # this file overrides paths for data on drac clusters drac_base_path: narval: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data - beluga: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data + beluga: /scratch/vsch/ocp-data is2re: 10k: From db31ce406cf9d79a7b45b3c6ae626c63dde16ae3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 12:36:46 -0500 Subject: [PATCH 128/273] imrpove `T_max` setting --- ocpmodels/common/utils.py | 8 +++--- ocpmodels/modules/scheduler.py | 6 ++++- ocpmodels/trainers/base_trainer.py | 41 +++++++++++++----------------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index d13d8a484f..f5c38cdc3c 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -666,9 +666,11 @@ def warmup_lr_lambda(current_step, optim_config): # exponential decay per step assert "decay_rate" in optim_config, "decay_rate must be defined in optim" ds = optim_config["decay_steps"] - if ds == "max_steps": - assert "max_steps" in optim_config, "max_steps must be defined in optim" - ds = optim_config["max_steps"] + if isinstance(ds, str): + assert ( + ds in optim_config + ), f"ds is {ds}, it must be defined in optim ({optim_config})" + ds = optim_config[ds] return optim_config["decay_rate"] ** ( (current_step - optim_config["warmup_steps"]) / ds diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 8a4d082188..5207e2b0e3 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -44,11 +44,15 @@ def scheduler_lambda_fn(x): scheduler_args = self.filter_kwargs(self.optim_config) 
self.scheduler = self.scheduler(optimizer, **scheduler_args) elif self.scheduler_type == "LinearWarmupCosineAnnealingLR": + T_max = ( + self.optim_config.get("fidelity_max_steps") + or self.optim_config["max_steps"] + ) self.warmup_scheduler = warmup.ExponentialWarmup( self.optimizer, warmup_period=self.optim_config["warmup_steps"] ) self.scheduler = lr_scheduler.CosineAnnealingLR( - self.optimizer, T_max=self.optim_config["max_steps"], eta_min=1e-7 + self.optimizer, T_max=T_max, eta_min=1e-7 ) def step(self, metrics=None, epoch=None): diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index ebe3b6a8bb..b2fa37d875 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -221,29 +221,10 @@ def load_datasets(self): transform = get_transforms(self.config) # TODO: train/val/test behavior batch_size = self.config["optim"]["batch_size"] - epochs_key = ( - "max_epochs" - if "fidelity_max_epochs" not in self.config["optim"] - else "fidelity_max_epochs" - ) - steps_key = ( - "max_steps" - if "fidelity_max_steps" not in self.config["optim"] - else "fidelity_max_steps" - ) - samples_key = ( - "max_samples" - if "fidelity_max_samples" not in self.config["optim"] - else "fidelity_max_samples" - ) - max_epochs = self.config["optim"].get(epochs_key, -1) - max_steps = self.config["optim"].get(steps_key, -1) - max_samples = self.config["optim"].get(samples_key, -1) - print("Optim config auto update:") - for k, v in zip( - [epochs_key, steps_key, samples_key], [max_epochs, max_steps, max_samples] - ): - print(f" • {k}: {v}") + + max_epochs = self.config["optim"].get("max_epochs", -1) + max_steps = self.config["optim"].get("max_steps", -1) + max_samples = self.config["optim"].get("max_samples", -1) for split, ds_conf in self.config["dataset"].items(): if split == "default_val": @@ -257,6 +238,20 @@ def load_datasets(self): if split == "train": shuffle = True n_train = len(self.datasets[split]) + + if "fidelity_max_epochs" in self.config["optim"]: + self.config["optim"]["fidelity_max_steps"] = int( + np.ceil( + self.config["optim"]["fidelity_max_epochs"] + * (n_train / batch_size) + ) + ) + print( + "Setting fidelity_max_steps to {}".format( + self.config["optim"]["fidelity_max_steps"] + ) + ) + if max_samples > 0: if max_epochs > 0: print( From 649f09d844de23e23fc049402d1743d668768c93 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:06:56 -0500 Subject: [PATCH 129/273] store all job ids --- ocpmodels/common/logger.py | 3 +++ ocpmodels/common/utils.py | 1 + ocpmodels/trainers/base_trainer.py | 4 ++++ 3 files changed, 8 insertions(+) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 0a3cddfb5b..b3eaec103b 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -95,6 +95,7 @@ def __init__(self, trainer_config): if trainer_config.get("wandb_resume_id"): wandb_id = trainer_config["wandb_resume_id"] + print("⛑ Resuming wandb run: ", wandb_id) else: wandb_id = str(self.trainer_config.get("wandb_id", "")) if wandb_id: @@ -110,6 +111,8 @@ def __init__(self, trainer_config): note = self.trainer_config.get("note", "") name = self.trainer_config["wandb_name"] or wandb_id + print("Initializing wandb run: ", wandb_id, "with name: ", name) + self.run = wandb.init( config=self.trainer_config, id=wandb_id, diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index f5c38cdc3c..d01d2d761e 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1011,6 +1011,7 
@@ def build_config(args, args_override): config["run_dir"] = resolve(config["run_dir"]) config["slurm"] = {} config["job_id"] = JOB_ID or "no-job-id" + config["job_ids"] = JOB_ID or "no-job-id" config["cluster_name"] = CLUSTER.name if "regress_forces" in config["model"]: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index b2fa37d875..2a07f85d3f 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -422,6 +422,10 @@ def load_checkpoint(self, checkpoint_path): if self.scaler and checkpoint["amp"]: self.scaler.load_state_dict(checkpoint["amp"]) + if "config" in checkpoint: + if "job_ids" in checkpoint["config"]: + self.config["job_ids"] = checkpoint["config"]["job_ids"] + f", {JOB_ID}" + def load_loss(self): self.loss_fn = {} self.loss_fn["energy"] = self.config["optim"].get("loss_energy", "mae") From b11a1019e6d0967bb745499ebb484f9db483d59d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:09:43 -0500 Subject: [PATCH 130/273] print which T_max is chosen --- ocpmodels/modules/scheduler.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 5207e2b0e3..e1c203059a 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -44,10 +44,13 @@ def scheduler_lambda_fn(x): scheduler_args = self.filter_kwargs(self.optim_config) self.scheduler = self.scheduler(optimizer, **scheduler_args) elif self.scheduler_type == "LinearWarmupCosineAnnealingLR": - T_max = ( - self.optim_config.get("fidelity_max_steps") - or self.optim_config["max_steps"] - ) + T_max = self.optim_config.get("fidelity_max_steps") + if T_max is None: + T_max = self.optim_config["max_steps"] + print(f"Using max_steps for scheduler -> {T_max}") + else: + print(f"Using fidelity_max_steps for scheduler -> {T_max}") + self.warmup_scheduler = warmup.ExponentialWarmup( self.optimizer, warmup_period=self.optim_config["warmup_steps"] ) From d5701a72adc2683257cfaaccc4bae8ee1abaae79 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:13:32 -0500 Subject: [PATCH 131/273] update `schmidtv` defaults --- configs/sbatch/schmidtv.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/configs/sbatch/schmidtv.yaml b/configs/sbatch/schmidtv.yaml index 942619513f..5f6dbfdf49 100644 --- a/configs/sbatch/schmidtv.yaml +++ b/configs/sbatch/schmidtv.yaml @@ -1,9 +1,5 @@ # Overwrites defaults.yaml for user `schmidtv`. # Create your own $USER.yaml in order to overwrite defaults.yaml systematically to your own taste. 
-cpus: 8 -mem: 128GB -env: ocp-env -gres: gpu:rtx8000:4 -partition: long +env: ocp-a100 modules: anaconda/3 From 985f07d47ac53afe3644e590d3e886cb68035bed Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:22:00 -0500 Subject: [PATCH 132/273] fix `set_max_fidelity` --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 2 +- ocpmodels/common/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index 41d5fd07b1..4dd2b528b7 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -4,7 +4,7 @@ job: cpus: 4 gres: gpu:1 time: 30:00 - partition: long + partition: main # code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab # env: ocp-a100 diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index d01d2d761e..bcf5e5627e 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -70,7 +70,7 @@ def __getattr__(self, k: str): def set_max_fidelity(hparams, orion_exp): for p, prior in orion_exp.space.items(): if prior.type == "fidelity": - hparams[f"fidelity_{p}"] = prior.high + hparams[f"fidelity_{p.split('/')[-1]}"] = prior.high return hparams From b3c3e61c22099a5f4d54d02bf5f1a57a9ce32c85 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:26:23 -0500 Subject: [PATCH 133/273] nested `set_max_fidelity` --- ocpmodels/common/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index bcf5e5627e..ff8e35e4b9 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -70,7 +70,15 @@ def __getattr__(self, k: str): def set_max_fidelity(hparams, orion_exp): for p, prior in orion_exp.space.items(): if prior.type == "fidelity": - hparams[f"fidelity_{p.split('/')[-1]}"] = prior.high + keys = p.split("/") + if len(keys) == 1: + hparams[f"fidelity_{p}"] = prior.high + elif len(keys) == 2: + if keys[0] not in hparams: + hparams[keys[0]] = {} + hparams[keys[0]][f"fidelity_{keys[1]}"] = prior.high + else: + print("Error: fidelity parameters must be at most 2 levels deep.") return hparams From 9486321ea8995770ffe55c9524c18aca43264a4e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:31:08 -0500 Subject: [PATCH 134/273] remove `max_epochs_fidelity` --- configs/exps/icml/is2re-all/fanet-orion-2.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/configs/exps/icml/is2re-all/fanet-orion-2.yaml b/configs/exps/icml/is2re-all/fanet-orion-2.yaml index a9f9a03370..ae26af2e9e 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-2.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-2.yaml @@ -20,7 +20,6 @@ default: weighted-av-final-embeds: True frame_averaging: 2D fa_frames: random - max_epochs_fidelity: 30 optim: scheduler: LinearWarmupCosineAnnealingLR note: From 126f8ae852b331797a4f0dea912d69cb7d9610e3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:36:51 -0500 Subject: [PATCH 135/273] fix exp manager with variable db path --- ocpmodels/common/exp_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index c5321075df..a52793f307 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -214,10 +214,6 @@ def help(self): "help": False, "name": None, "wandb_path": None, - "orion_db_path": str( - 
Path(__file__).resolve().parent.parent.parent - / "data/orion/storage/orion_db.pkl" - ), "watch": -1, } args = resolved_args(defaults=defaults) @@ -248,10 +244,14 @@ def help(self): "💃 Status of experiment", f"'{args.name}' and wandb entity/project '{args.wandb_path}':", ) + orion_db_path = str( + Path(__file__).resolve().parent.parent.parent + / f"data/orion/storage/{args.name}_db.pkl" + ) m = Manager( name=args.name, wandb_path=args.wandb_path, - orion_db_path=args.orion_db_path, + orion_db_path=orion_db_path, ) m.print_wandb_query() From 5b100ce0b15d84e3e2e9869cf201ba2f62592224 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:37:57 -0500 Subject: [PATCH 136/273] clean prints --- ocpmodels/common/exp_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index a52793f307..27ceb86817 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -254,9 +254,9 @@ def help(self): orion_db_path=orion_db_path, ) - m.print_wandb_query() - exp_df = m.exp.to_pandas() - reserved_wandbs = m.get_reserved_wandb_runs() + # m.print_wandb_query() + # exp_df = m.exp.to_pandas() + # reserved_wandbs = m.get_reserved_wandb_runs() if args.watch and args.watch > 0: if args.watch < 15: From e4a209844edee3da342b494d90b49cd9d670a167 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:59:03 -0500 Subject: [PATCH 137/273] update --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 8 ++++---- ocpmodels/common/exp_manager.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index aea67fb2d8..983e67f951 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -36,10 +36,10 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v1.0.1 + unique_exp_name: fanet-qm9-v1.0.2 space: - optim/max_epochs: fidelity(30, 300, base=6) + optim/max_epochs: fidelity(50, 300, base=6) optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) model/graph_norm: choices([True, False]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) @@ -47,8 +47,8 @@ orion: model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) model/num_filters: uniform(3, 16, discrete=True) model/num_gaussians: uniform(20, 150, discrete=True) - model/num_interactions: uniform(1, 7, discrete=True) - model/pg_hidden_channels: uniform(0, 2, discrete=True) + model/num_interactions: uniform(2, 7, discrete=True) + model/pg_hidden_channels: uniform(0, 1, discrete=True) model/phys_embeds: choices([True, False]) algorithms: asha: diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 27ceb86817..be8ab8f307 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -1,7 +1,7 @@ +import wandb from orion.client import get_experiment from pathlib import Path from collections import defaultdict, Counter -import wandb from textwrap import dedent from minydra import resolved_args import os From eb05b524fe8984aad4608b6e51c1033a35ddcf7f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 18:40:38 -0500 Subject: [PATCH 138/273] update exp name --- configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml b/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml index 58c5c64538..efbe7ec3d3 100644 --- a/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml +++ b/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml @@ -34,7 +34,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 50 - unique_exp_name: fanet-s2ef-2M-v1 + unique_exp_name: fanet-s2ef-2M-v1.1 space: model/att_heads: choices([1,2,3,4]) From 027c8505a181f661490fcf804fefa437244e446a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 22:21:12 -0500 Subject: [PATCH 139/273] `distutils` rename to `dist_utils` not to conflict with stdlib --- main.py | 72 +++++++++++-------- ocpmodels/common/data_parallel.py | 4 +- .../common/{distutils.py => dist_utils.py} | 0 ocpmodels/common/exp_manager.py | 7 +- ocpmodels/modules/loss.py | 6 +- ocpmodels/trainers/base_trainer.py | 50 ++++++------- ocpmodels/trainers/energy_trainer.py | 10 +-- ocpmodels/trainers/forces_trainer.py | 26 +++---- ocpmodels/trainers/single_trainer.py | 28 ++++---- 9 files changed, 111 insertions(+), 92 deletions(-) rename ocpmodels/common/{distutils.py => dist_utils.py} (100%) diff --git a/main.py b/main.py index 4d0a29dba8..d7505e383f 100644 --- a/main.py +++ b/main.py @@ -8,20 +8,22 @@ import copy import logging import os +import shutil import time import traceback import warnings import torch +from orion.core.utils.exceptions import ReservationRaceCondition from yaml import dump -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.flags import flags from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, - auto_note, apply_mult_factor, + auto_note, build_config, continue_from_slurm_job_id, continue_orion_exp, @@ -30,11 +32,11 @@ move_lmdb_data_to_slurm_tmpdir, read_slurm_env, resolve, + set_max_fidelity, setup_imports, setup_logging, unflatten_dict, update_from_sbatch_py_vars, - set_max_fidelity, ) from ocpmodels.trainers import BaseTrainer @@ -75,30 +77,42 @@ def __init__(self, trainer_config): def run(self, orion_exp=None): orion_trial = None self.original_config = copy.deepcopy(self.trainer_config) - if distutils.is_master(): + orion_race_condition = False + if dist_utils.is_master(): if orion_exp: - orion_trial = orion_exp.suggest(1) - self.hparams = set_max_fidelity( - unflatten_dict( - apply_mult_factor( - orion_trial.params, - self.trainer_config.get("orion_mult_factor"), - sep="/", - ), - sep="/", - ), - orion_exp, - ) - self.hparams["orion_hash_params"] = orion_trial.hash_params - self.hparams["orion_unique_exp_name"] = orion_exp.name + try: + orion_trial = orion_exp.suggest(1) + self.hparams = set_max_fidelity( + unflatten_dict( + apply_mult_factor( + orion_trial.params, + self.trainer_config.get("orion_mult_factor"), + sep="/", + ), + sep="/", + ), + orion_exp, + ) + self.hparams["orion_hash_params"] = orion_trial.hash_params + self.hparams["orion_unique_exp_name"] = orion_exp.name + except ReservationRaceCondition: + orion_race_condition = True - should_be_0 = distutils.get_rank() - hp_list = [self.hparams, should_be_0] + should_be_0 = dist_utils.get_rank() + hp_list = [self.hparams, should_be_0, orion_race_condition] # print("hparams pre-broadcast: ", hparams) - distutils.broadcast_object_list(hp_list) - self.hparams, should_be_0 = hp_list + dist_utils.broadcast_object_list(hp_list) + self.hparams, should_be_0, orion_race_condition = hp_list # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 + if orion_race_condition: + print( + "\n🚨 Orion reservation race condition detected. Exiting", + "and deleting run dir", + ) + if dist_utils.is_master(): + shutil.rmtree(self.trainer_config["run_dir"]) + return if self.hparams: print("\n💎 Received hyper-parameters from Orion:") print(dump(self.hparams), end="\n") @@ -120,7 +134,7 @@ def run(self, orion_exp=None): print("\nJob was preempted. Wrapping up...\n") self.trainer.close_datasets() - distutils.synchronize() + dist_utils.synchronize() logging.info(f"Total time taken: {time.time() - start_time}") if self.trainer.logger is not None: self.trainer.logger.log({"Total time": time.time() - start_time}) @@ -128,7 +142,7 @@ def run(self, orion_exp=None): objective = self.trainer.objective # print("objective pre-broadcast: ", objective) o_list = [objective] - distutils.broadcast_object_list(o_list) + dist_utils.broadcast_object_list(o_list) objective = o_list[0] # print("objective post-broadcast: ", objective) @@ -162,12 +176,12 @@ def run(self, orion_exp=None): original_trainer_config = copy.deepcopy(trainer_config) if args.distributed: - distutils.setup(trainer_config) + dist_utils.setup(trainer_config) print("Distributed backend setup.") - if distutils.is_master(): + if dist_utils.is_master(): trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) - # distutils.synchronize() + # dist_utils.synchronize() # ------------------- # ----- Setup ----- @@ -183,7 +197,7 @@ def run(self, orion_exp=None): # ------------------- # ----- Train ----- # ------------------- - if args.orion_exp_config_path and distutils.is_master(): + if args.orion_exp_config_path and dist_utils.is_master(): experiment = load_orion_exp(args) print("\nStarting runner.") runner.run(orion_exp=experiment) @@ -198,10 +212,10 @@ def run(self, orion_exp=None): finally: if args.distributed: print( - "\nWaiting for all processes to finish with distutils.cleanup()...", + "\nWaiting for all processes to finish with dist_utils.cleanup()...", end="", ) - distutils.cleanup() + dist_utils.cleanup() print("Done!") if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read():
diff --git a/ocpmodels/common/data_parallel.py b/ocpmodels/common/data_parallel.py index b66b90ede2..9c57b6bc71 100644 --- a/ocpmodels/common/data_parallel.py +++ b/ocpmodels/common/data_parallel.py @@ -14,7 +14,7 @@ import torch from torch.utils.data import BatchSampler, DistributedSampler, Sampler -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.datasets import data_list_collater @@ -216,7 +216,7 @@ def __iter__(self): sizes = 
[self.sizes[idx] for idx in batch_idx] idx_sizes = torch.stack([torch.tensor(batch_idx), torch.tensor(sizes)]) - idx_sizes_all = distutils.all_gather(idx_sizes, device=self.device) + idx_sizes_all = dist_utils.all_gather(idx_sizes, device=self.device) idx_sizes_all = torch.cat(idx_sizes_all, dim=-1).cpu() idx_all = idx_sizes_all[0] sizes_all = idx_sizes_all[1] diff --git a/ocpmodels/common/distutils.py b/ocpmodels/common/dist_utils.py similarity index 100% rename from ocpmodels/common/distutils.py rename to ocpmodels/common/dist_utils.py diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index be8ab8f307..44e56ea0fe 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -105,10 +105,15 @@ def print_status(self): ) print("{:32} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) print("{:32} : {}".format("Algorithm's budgets", str(self.budgets))) + sq_cmd = ( + "/opt/slurm/bin/squeue" + if "CC_CUSTER" not in os.environ + else "/opt/software/slurm/bin/squeue" + ) sq = set( [ j.strip() - for j in os.popen("/opt/slurm/bin/squeue -u $USER -o '%12i'") + for j in os.popen(f"{sq_cmd} -u $USER -o '%12i'") .read() .splitlines()[1:] ] diff --git a/ocpmodels/modules/loss.py b/ocpmodels/modules/loss.py index 42122b5d07..cb305f0759 100644 --- a/ocpmodels/modules/loss.py +++ b/ocpmodels/modules/loss.py @@ -1,7 +1,7 @@ import torch from torch import nn -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils class L2MAELoss(nn.Module): @@ -30,9 +30,9 @@ def forward(self, input: torch.Tensor, target: torch.Tensor): loss = self.loss_fn(input, target) if self.reduction == "mean": num_samples = input.shape[0] - num_samples = distutils.all_reduce(num_samples, device=input.device) + num_samples = dist_utils.all_reduce(num_samples, device=input.device) # Multiply by world size since gradients are averaged # across DDP replicas - return loss * distutils.get_world_size() / num_samples + return loss * dist_utils.get_world_size() / num_samples else: return loss diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 2a07f85d3f..52adcbdf9a 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -27,7 +27,7 @@ from torch_geometric.data import Batch from tqdm import tqdm -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.data_parallel import ( BalancedBatchSampler, OCPDataParallel, @@ -58,7 +58,7 @@ def __init__(self, **kwargs): self.config = { **kwargs, "model_name": model_name, - "gpus": distutils.get_world_size() if not kwargs["cpu"] else 0, + "gpus": dist_utils.get_world_size() if not kwargs["cpu"] else 0, "commit": get_commit_hash(), "checkpoint_dir": str(Path(run_dir) / "checkpoints"), "results_dir": str(Path(run_dir) / "results"), @@ -94,7 +94,7 @@ def __init__(self, **kwargs): timestamp = torch.tensor(datetime.datetime.now().timestamp()).to(self.device) # create directories from master rank only - distutils.broadcast(timestamp, 0) + dist_utils.broadcast(timestamp, 0) timestamp = datetime.datetime.fromtimestamp(timestamp.int()).strftime( "%Y-%m-%d-%H-%M-%S" ) @@ -121,7 +121,7 @@ def __init__(self, **kwargs): ): self.normalizer = self.config["dataset"]["train"] - if not self.is_debug and distutils.is_master() and not self.is_hpo: + if not self.is_debug and dist_utils.is_master() and not self.is_hpo: os.makedirs(self.config["checkpoint_dir"], exist_ok=True) os.makedirs(self.config["results_dir"], 
exist_ok=True) os.makedirs(self.config["logs_dir"], exist_ok=True) @@ -138,7 +138,7 @@ def __init__(self, **kwargs): # default is no checkpointing self.hpo_checkpoint_every = self.config["optim"].get("checkpoint_every", -1) - if distutils.is_master() and not self.silent: + if dist_utils.is_master() and not self.silent: print("🧰 Trainer config:") print(yaml.dump(self.config), end="\n\n") self.load() @@ -174,7 +174,7 @@ def load_seed_from_config(self): def load_logger(self): self.logger = None - if not self.is_debug and distutils.is_master() and not self.is_hpo: + if not self.is_debug and dist_utils.is_master() and not self.is_hpo: assert self.config["logger"] is not None, "Specify logger in config" logger = self.config["logger"] @@ -194,8 +194,8 @@ def get_sampler(self, dataset, batch_size, shuffle): sampler = BalancedBatchSampler( dataset, batch_size=batch_size, - num_replicas=distutils.get_world_size(), - rank=distutils.get_rank(), + num_replicas=dist_utils.get_world_size(), + rank=dist_utils.get_rank(), device=self.device, mode=balancing_mode, shuffle=shuffle, @@ -355,7 +355,7 @@ def load_model(self): **self.config["model"], ).to(self.device) - if distutils.is_master() and not self.silent: + if dist_utils.is_master() and not self.silent: logging.info( f"Loaded {self.model.__class__.__name__} with " f"{self.model.num_params} parameters." @@ -369,7 +369,7 @@ def load_model(self): output_device=self.device, num_gpus=1 if not self.cpu else 0, ) - if distutils.initialized(): + if dist_utils.initialized(): self.model = DistributedDataParallel( self.model, device_ids=[self.device], output_device=self.device ) @@ -390,12 +390,12 @@ def load_checkpoint(self, checkpoint_path): # if trained with ddp and want to load in non-ddp, modify keys from # module.module.. -> module.. 
first_key = next(iter(checkpoint["state_dict"])) - if not distutils.initialized() and first_key.split(".")[1] == "module": + if not dist_utils.initialized() and first_key.split(".")[1] == "module": # No need for OrderedDict since dictionaries are technically ordered # since Python 3.6 and officially ordered since Python 3.7 new_dict = {k[7:]: v for k, v in checkpoint["state_dict"].items()} self.model.load_state_dict(new_dict) - elif distutils.initialized() and first_key.split(".")[1] != "module": + elif dist_utils.initialized() and first_key.split(".")[1] != "module": new_dict = {f"module.{k}": v for k, v in checkpoint["state_dict"].items()} self.model.load_state_dict(new_dict) else: @@ -439,7 +439,7 @@ def load_loss(self): self.loss_fn[loss] = L2MAELoss() else: raise NotImplementedError(f"Unknown loss function name: {loss_name}") - if distutils.initialized(): + if dist_utils.initialized(): self.loss_fn[loss] = DDPLoss(self.loss_fn[loss]) def load_optimizer(self): @@ -505,7 +505,7 @@ def save( checkpoint_file="checkpoint.pt", training_state=True, ): - if not self.is_debug and distutils.is_master(): + if not self.is_debug and dist_utils.is_master(): if training_state: save_checkpoint( { @@ -551,7 +551,7 @@ def save( ) if self.ema: self.ema.restore() - distutils.synchronize() + dist_utils.synchronize() def save_hpo(self, epoch, step, metrics, checkpoint_every): # default is no checkpointing @@ -601,7 +601,7 @@ def validate( is_final=False, is_first=False, ): - if distutils.is_master() and not self.silent: + if dist_utils.is_master() and not self.silent: print() logging.info(f"🧐 Evaluating on {split}.") if self.is_hpo: @@ -617,7 +617,7 @@ def validate( model_regresses_forces=self.config["model"].get("regress_forces", ""), ) metrics = {} - desc = "device {}".format(distutils.get_rank()) + desc = "device {}".format(dist_utils.get_rank()) loader = self.loaders[split] times = Times(gpu=True) @@ -651,10 +651,10 @@ def validate( aggregated_metrics = {} for k in metrics: aggregated_metrics[k] = { - "total": distutils.all_reduce( + "total": dist_utils.all_reduce( metrics[k]["total"], average=False, device=self.device ), - "numel": distutils.all_reduce( + "numel": dist_utils.all_reduce( metrics[k]["numel"], average=False, device=self.device ), } @@ -670,7 +670,7 @@ def validate( log_dict["model_forward_time_mean"] = mean_val_times["model_forward"] log_dict["model_forward_time_std"] = std_val_times["model_forward"] - if distutils.is_master() and not self.silent: + if dist_utils.is_master() and not self.silent: log_str = ["{}: {:.4f}".format(k, v) for k, v in log_dict.items()] print("\n > ".join([""] + log_str)) print() @@ -741,7 +741,7 @@ def save_results(self, predictions, results_file, keys): results_file_path = os.path.join( self.config["results_dir"], - f"{self.task_name}_{results_file}_{distutils.get_rank()}.npz", + f"{self.task_name}_{results_file}_{dist_utils.get_rank()}.npz", ) np.savez_compressed( results_file_path, @@ -749,15 +749,15 @@ def save_results(self, predictions, results_file, keys): **{key: predictions[key] for key in keys}, ) - distutils.synchronize() - if distutils.is_master(): + dist_utils.synchronize() + if dist_utils.is_master(): gather_results = defaultdict(list) full_path = os.path.join( self.config["results_dir"], f"{self.task_name}_{results_file}.npz", ) - for i in range(distutils.get_world_size()): + for i in range(dist_utils.get_world_size()): rank_path = os.path.join( self.config["results_dir"], f"{self.task_name}_{results_file}_{i}.npz", @@ -843,7 +843,7 @@ def 
eval_all_splits( } # Log specific metrics - if final and self.config["logger"] == "wandb" and distutils.is_master(): + if final and self.config["logger"] == "wandb" and dist_utils.is_master(): overall_energy_mae = cumulated_energy_mae / len(all_splits) self.logger.log({"Eval time": cumulated_time}) self.objective = overall_energy_mae diff --git a/ocpmodels/trainers/energy_trainer.py b/ocpmodels/trainers/energy_trainer.py index 40d09e5379..403dbece6f 100644 --- a/ocpmodels/trainers/energy_trainer.py +++ b/ocpmodels/trainers/energy_trainer.py @@ -13,7 +13,7 @@ import torch_geometric from tqdm import tqdm -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.registry import registry from ocpmodels.trainers.base_trainer import BaseTrainer @@ -39,7 +39,7 @@ def load_task(self): @torch.no_grad() def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False): - if distutils.is_master() and not disable_tqdm: + if dist_utils.is_master() and not disable_tqdm: logging.info("Predicting on test.") assert isinstance( loader, @@ -48,7 +48,7 @@ def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False) torch_geometric.data.Batch, ), ) - rank = distutils.get_rank() + rank = dist_utils.get_rank() if isinstance(loader, torch_geometric.data.Batch): loader = [[loader]] @@ -320,9 +320,9 @@ def _log_metrics(self, end_of_epoch=False): ) if ( self.step % self.config["print_every"] == 0 - and distutils.is_master() + and dist_utils.is_master() and not self.is_hpo - ) or (distutils.is_master() and end_of_epoch): + ) or (dist_utils.is_master() and end_of_epoch): log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] if not self.silent: print(", ".join(log_str)) diff --git a/ocpmodels/trainers/forces_trainer.py b/ocpmodels/trainers/forces_trainer.py index d46724f412..b8b809ec3a 100644 --- a/ocpmodels/trainers/forces_trainer.py +++ b/ocpmodels/trainers/forces_trainer.py @@ -14,7 +14,7 @@ import torch_geometric from tqdm import tqdm -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.registry import registry from ocpmodels.common.relaxation.ml_relaxation import ml_relax from ocpmodels.common.utils import check_traj_files @@ -89,7 +89,7 @@ def predict( results_file=None, disable_tqdm=False, ): - if distutils.is_master() and not disable_tqdm: + if dist_utils.is_master() and not disable_tqdm: logging.info("Predicting on test.") assert isinstance( data_loader, @@ -98,7 +98,7 @@ def predict( torch_geometric.data.Batch, ), ) - rank = distutils.get_rank() + rank = dist_utils.get_rank() if isinstance(data_loader, torch_geometric.data.Batch): data_loader = [[data_loader]] @@ -251,7 +251,7 @@ def train(self, disable_eval_tqdm=False): ) if ( self.step % self.config["print_every"] == 0 - and distutils.is_master() + and dist_utils.is_master() and not self.is_hpo ): log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] @@ -376,11 +376,11 @@ def compute_loss(self, out, batch_list): train_loss_force_normalizer = 3.0 * weight.sum() # add up normalizer to obtain global normalizer - distutils.all_reduce(train_loss_force_normalizer) + dist_utils.all_reduce(train_loss_force_normalizer) # perform loss normalization before backprop train_loss_force_normalized = train_loss_force_unnormalized * ( - distutils.get_world_size() / train_loss_force_normalizer + dist_utils.get_world_size() / train_loss_force_normalizer ) loss.append(train_loss_force_normalized) @@ -534,7 +534,7 @@ def 
run_relaxations(self, split="val"): ) if self.config["task"].get("write_pos", False): - rank = distutils.get_rank() + rank = dist_utils.get_rank() pos_filename = os.path.join( self.config["results_dir"], f"relaxed_pos_{rank}.npz" ) @@ -545,15 +545,15 @@ def run_relaxations(self, split="val"): chunk_idx=chunk_idx, ) - distutils.synchronize() - if distutils.is_master(): + dist_utils.synchronize() + if dist_utils.is_master(): gather_results = defaultdict(list) full_path = os.path.join( self.config["results_dir"], "relaxed_positions.npz", ) - for i in range(distutils.get_world_size()): + for i in range(dist_utils.get_world_size()): rank_path = os.path.join( self.config["results_dir"], f"relaxed_pos_{i}.npz", @@ -586,12 +586,12 @@ def run_relaxations(self, split="val"): aggregated_metrics = {} for k in metrics: aggregated_metrics[k] = { - "total": distutils.all_reduce( + "total": dist_utils.all_reduce( metrics[k]["total"], average=False, device=self.device, ), - "numel": distutils.all_reduce( + "numel": dist_utils.all_reduce( metrics[k]["numel"], average=False, device=self.device, @@ -611,7 +611,7 @@ def run_relaxations(self, split="val"): split=split, ) - if distutils.is_master(): + if dist_utils.is_master(): logging.info(metrics) if self.ema: diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index eeb1b50356..b4395cdcce 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -19,7 +19,7 @@ from torch_geometric.data import Data from tqdm import tqdm -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.registry import registry from ocpmodels.common.relaxation.ml_relaxation import ml_relax from ocpmodels.common.utils import OCP_TASKS, check_traj_files @@ -94,7 +94,7 @@ def load_task(self): @torch.no_grad() def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False): - if distutils.is_master() and not disable_tqdm: + if dist_utils.is_master() and not disable_tqdm: logging.info("Predicting on test.") assert isinstance( loader, @@ -103,7 +103,7 @@ def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False) torch_geometric.data.Batch, ), ) - rank = distutils.get_rank() + rank = dist_utils.get_rank() if isinstance(loader, torch_geometric.data.Batch): loader = [[loader]] @@ -503,11 +503,11 @@ def compute_loss(self, preds, batch_list): train_loss_force_normalizer = 3.0 * weight.sum() # add up normalizer to obtain global normalizer - distutils.all_reduce(train_loss_force_normalizer) + dist_utils.all_reduce(train_loss_force_normalizer) # perform loss normalization before backprop train_loss_force_normalized = train_loss_force_unnormalized * ( - distutils.get_world_size() / train_loss_force_normalizer + dist_utils.get_world_size() / train_loss_force_normalizer ) loss.append(train_loss_force_normalized) @@ -624,9 +624,9 @@ def log_train_metrics(self, end_of_epoch=False): ) if ( self.step % self.config["print_every"] == 0 - and distutils.is_master() + and dist_utils.is_master() and not self.is_hpo - ) or (distutils.is_master() and end_of_epoch): + ) or (dist_utils.is_master() and end_of_epoch): if not self.silent: log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] print( @@ -859,7 +859,7 @@ def run_relaxations(self, split="val"): ) if self.config["task"].get("write_pos", False): - rank = distutils.get_rank() + rank = dist_utils.get_rank() pos_filename = os.path.join( self.config["results_dir"], f"relaxed_pos_{rank}.npz" ) 
@@ -870,15 +870,15 @@ def run_relaxations(self, split="val"): chunk_idx=chunk_idx, ) - distutils.synchronize() - if distutils.is_master(): + dist_utils.synchronize() + if dist_utils.is_master(): gather_results = defaultdict(list) full_path = os.path.join( self.config["results_dir"], "relaxed_positions.npz", ) - for i in range(distutils.get_world_size()): + for i in range(dist_utils.get_world_size()): rank_path = os.path.join( self.config["results_dir"], f"relaxed_pos_{i}.npz", @@ -911,12 +911,12 @@ def run_relaxations(self, split="val"): aggregated_metrics = {} for k in metrics: aggregated_metrics[k] = { - "total": distutils.all_reduce( + "total": dist_utils.all_reduce( metrics[k]["total"], average=False, device=self.device, ), - "numel": distutils.all_reduce( + "numel": dist_utils.all_reduce( metrics[k]["numel"], average=False, device=self.device, @@ -936,7 +936,7 @@ def run_relaxations(self, split="val"): split=split, ) - if distutils.is_master(): + if dist_utils.is_master(): logging.info(metrics) if self.ema: From 2728eb124168bfd153556020f3d3e38fdd861a3f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 22:26:22 -0500 Subject: [PATCH 140/273] typo --- ocpmodels/common/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 44e56ea0fe..f59a540919 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -107,7 +107,7 @@ def print_status(self): print("{:32} : {}".format("Algorithm's budgets", str(self.budgets))) sq_cmd = ( "/opt/slurm/bin/squeue" - if "CC_CUSTER" not in os.environ + if "CC_CLUSTER" not in os.environ else "/opt/software/slurm/bin/squeue" ) sq = set( From 6967e3ec93559df9799d8a0338257bab74aa9213 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 22:28:54 -0500 Subject: [PATCH 141/273] clean trailing line --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 5ba27a16cf..42bc745809 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -196,7 +196,7 @@ def continue_orion_exp(trainer_config): return trainer_config trainer_config["checkpoint"] = str(resume_ckpts[-1]) - resume_url = (resume_dir / "wandb_url.txt").read_text() + resume_url = (resume_dir / "wandb_url.txt").read_text().strip() trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] print( From dfadf1a442ed350c9388db8669ac42f9e3388507 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 23:59:17 -0500 Subject: [PATCH 142/273] use tmpdir env var --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 7c39a9d987..aefe165f58 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -310,7 +310,7 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): print("\nMoving data to slurm tmpdir", flush=True) - tmp_dir = Path(f"/Tmp/slurm.{JOB_ID}.0") + tmp_dir = os.environ.get("SLURM_TMPDIR") or Path(f"/Tmp/slurm.{JOB_ID}.0") for s, split in trainer_config["dataset"].items(): if not isinstance(split, dict): continue From 3023d54194cb3ec2242c1ce9246bda57e76f0397 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 11:17:12 -0500 Subject: [PATCH 143/273] update --- configs/exps/icml/qm9/fanet-manual.yaml | 57 ++++++++++++++++++++++ configs/exps/icml/qm9/fanet-orion-qm9.yaml | 4 +- main.py | 17 
+++++-- 3 files changed, 72 insertions(+), 6 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-manual.yaml diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml new file mode 100644 index 0000000000..b45b7e4587 --- /dev/null +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -0,0 +1,57 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: main + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-v1.0.2-continued + log_train_every: 100 + optim: + warmup_steps: 2000 + # parameters EMA + ema_decay: 0.999 + decay_steps: max_steps + scheduler: LinearWarmupCosineAnnealingLR + batch_size: 64 + initial_lr: 0.001 + max_epochs: 1500 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels + frame_averaging: 3D + fa_frames: random + model: + edge_embed_type: all_rij + energy_head: weighted-av-initial-embeds + graph_norm: True + hidden_channels: 416 + max_num_neighbors: 40 + mp_type: updownscale + num_filters: 512 + num_gaussians: 100 + num_interactions: 3 + otf_graph: false + pg_hidden_channels: 0 + phys_embeds: false + phys_hidden_channels: 0 + second_layer_MLP: false + skip_co: false + tag_hidden_channels: 0 + use_pbc: false + regress_forces: "" + + +runs: + - {} + - model: + mp_type: base_with_att \ No newline at end of file diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 983e67f951..97aa0a69bf 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -36,10 +36,10 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v1.0.2 + unique_exp_name: fanet-qm9-v2.0.0 space: - optim/max_epochs: fidelity(50, 300, base=6) + optim/max_epochs: fidelity(100, 2000, base=5) optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) model/graph_norm: choices([True, False]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) diff --git a/main.py b/main.py index d7505e383f..67fb6b6546 100644 --- a/main.py +++ b/main.py @@ -101,6 +101,13 @@ def run(self, orion_exp=None): self.hparams["orion_unique_exp_name"] = orion_exp.name except ReservationRaceCondition: orion_race_condition = True + import wandb + + if wandb.run is not None: + if wandb.run.tags: + wandb.run.tags = wandb.run.tags + ("RaceCondition",) + else: + wandb.run.tags = ("RaceCondition",) should_be_0 = dist_utils.get_rank() hp_list = [self.hparams, should_be_0, orion_race_condition] @@ -147,10 +154,12 @@ def run(self, orion_exp=None): # print("objective post-broadcast: ", objective) if orion_exp is not None: - orion_exp.observe( - orion_trial, - [{"type": "objective", "name": "energy_mae", "value": objective}], - ) + if objective is not None: + orion_exp.observe( + orion_trial, + [{"type": "objective", "name": "energy_mae", "value": objective}], + ) + print("Received None objective from worker. 
Skipping observation.") if __name__ == "__main__": From 42e2f48483a84c35d892b9f5d26ed943ac72b9fc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 11:19:53 -0500 Subject: [PATCH 144/273] hotfix tmp dir --- ocpmodels/common/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index aefe165f58..edade03673 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -310,7 +310,8 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): print("\nMoving data to slurm tmpdir", flush=True) - tmp_dir = os.environ.get("SLURM_TMPDIR") or Path(f"/Tmp/slurm.{JOB_ID}.0") + tmp_dir = os.environ.get("SLURM_TMPDIR") or f"/Tmp/slurm.{JOB_ID}.0" + tmp_dir = Path(tmp_dir) for s, split in trainer_config["dataset"].items(): if not isinstance(split, dict): continue From c0d08ab5b2d6402f6fe021ebf819d905f3fd9d01 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 14:02:20 -0500 Subject: [PATCH 145/273] beluga wandb offline --- sbatch.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sbatch.py b/sbatch.py index 2a6cfd57e1..0d6affdfbe 100644 --- a/sbatch.py +++ b/sbatch.py @@ -38,7 +38,7 @@ else conda activate {env} fi - +{wandb_offline} srun --output={output} {python_command} """ @@ -217,6 +217,7 @@ def write_orion_config(args, outdir): if __name__ == "__main__": # has the submission been successful? success = False + wandb_offline = "" sbatch_py_vars = {} # repository root @@ -307,6 +308,9 @@ def write_orion_config(args, outdir): if "a100" in args.env: modules += ["cuda/11.2"] + if os.environ.get("CC_CLUSTER") == "beluga": + wandb_offline = "wandb offline\necho 'wandb offline'" + # format string template with defaults + command-line args script = template.format( code_loc=(str(resolve(args.code_loc)) if args.code_loc else str(root)), @@ -322,6 +326,7 @@ def write_orion_config(args, outdir): sbatch_params=make_sbatch_params(sbatch_params), sbatch_py_vars=make_sbatch_py_vars(sbatch_py_vars), virtualenv=virtualenv, + wandb_offline=wandb_offline, ) # default script path to execute `sbatch {script_path}/script_{now()}.sh` From 96e476c8645bff70581da01edc74a9f6a5f5e5ca Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 15:00:41 -0500 Subject: [PATCH 146/273] avoir url file on Beluga --- ocpmodels/common/logger.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 0a8a432719..380401332c 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -132,8 +132,9 @@ def __init__(self, trainer_config): wandb.save(str(sbatch_files[0])) self.url = wandb.run.get_url() - with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: - f.write(self.url + "\n") + if self.url: + with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: + f.write(self.url + "\n") if not CLUSTER.drac: self.collect_output_files(policy="live") self.collect_output_files(policy="end") From 5e6bf83d918d9ee35d6d27571948a282e44f28f1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 15:23:23 -0500 Subject: [PATCH 147/273] report 1e12 if Nan --- main.py | 24 ++++++++++-------------- ocpmodels/common/dist_utils.py | 12 +++++++++--- ocpmodels/trainers/single_trainer.py | 2 +- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 67fb6b6546..b3f02d599b 100644 --- a/main.py +++ b/main.py @@ -109,13 +109,9 @@ def 
run(self, orion_exp=None): else: wandb.run.tags = ("RaceCondition",) - should_be_0 = dist_utils.get_rank() - hp_list = [self.hparams, should_be_0, orion_race_condition] - # print("hparams pre-broadcast: ", hparams) - dist_utils.broadcast_object_list(hp_list) - self.hparams, should_be_0, orion_race_condition = hp_list - # print("hparams post-broadcast: ", hparams) - assert should_be_0 == 0 + self.hparams, orion_race_condition = dist_utils.broadcast_from_master( + self.hparams, orion_race_condition + ) if orion_race_condition: if dist_utils.is_master(): shutil.rmtree(self.trainer_config["run_dir"]) @@ -146,20 +142,20 @@ def run(self, orion_exp=None): if self.trainer.logger is not None: self.trainer.logger.log({"Total time": time.time() - start_time}) - objective = self.trainer.objective - # print("objective pre-broadcast: ", objective) - o_list = [objective] - dist_utils.broadcast_object_list(o_list) - objective = o_list[0] - # print("objective post-broadcast: ", objective) + objective = dist_utils.broadcast_from_master(self.trainer.objective) if orion_exp is not None: + if objective is None: + if signal == "loss_is_nan": + objective = 1e12 + print("Received NaN objective from worker. Setting to 1e12.") + else: + print("Received None objective from worker. Skipping observation.") if objective is not None: orion_exp.observe( orion_trial, [{"type": "objective", "name": "energy_mae", "value": objective}], ) - print("Received None objective from worker. Skipping observation.") if __name__ == "__main__": diff --git a/ocpmodels/common/dist_utils.py b/ocpmodels/common/dist_utils.py index d4f4c13894..024b98e280 100644 --- a/ocpmodels/common/dist_utils.py +++ b/ocpmodels/common/dist_utils.py @@ -95,15 +95,21 @@ def broadcast(tensor, src, group=dist.group.WORLD, async_op=False): dist.broadcast(tensor, src, group, async_op) -def broadcast_object_list(obj_list, src=0): +def broadcast_from_master(*obj_list): if get_world_size() == 1: - return + if len(obj_list) == 1: + return obj_list[0] + return obj_list + obj_list = list(obj_list) dist.broadcast_object_list( obj_list, - src=src, + src=0, group=dist.group.WORLD, device=torch.device(f"cuda:{get_rank()}"), ) + if len(obj_list) == 1: + return obj_list[0] + return obj_list def all_reduce(data, group=dist.group.WORLD, average=False, device=None): diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index b4395cdcce..2b43628a61 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -262,7 +262,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): if torch.isnan(loss["total_loss"]): print("\n\n >>> 🛑 Loss is NaN. Stopping training.\n\n") self.logger.add_tags(["nan_loss"]) - return True + return "loss_is_nan" self._backward(loss) # Compute metrics. 
From cad68010d585295381a8c318c5335ba582b3bda9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 16:03:25 -0500 Subject: [PATCH 148/273] clearer prints --- ocpmodels/tasks/task.py | 2 ++ ocpmodels/trainers/base_trainer.py | 1 + 2 files changed, 3 insertions(+) diff --git a/ocpmodels/tasks/task.py b/ocpmodels/tasks/task.py index 0bbd72de32..bcef87446f 100644 --- a/ocpmodels/tasks/task.py +++ b/ocpmodels/tasks/task.py @@ -19,7 +19,9 @@ def __init__(self, config): def setup(self, trainer): self.trainer = trainer if self.config.get("checkpoint") is not None: + print("\n🔵 Resuming:\n • ", end="", flush=True) self.trainer.load_checkpoint(self.config["checkpoint"]) + print() # save checkpoint path to runner state for slurm resubmissions self.chkpt_path = os.path.join( diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 52adcbdf9a..244ddfd317 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -894,6 +894,7 @@ def eval_all_splits( console = Console() console.print(table) print() + print("\n• Trainer objective set to:", self.objective, end="\n\n") def rotate_graph(self, batch, rotation=None): """Rotate all graphs in a batch From cc3ed561729a0573496750a176724f66093e1084 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 18:05:40 -0500 Subject: [PATCH 149/273] Early stop from min_lr --- ocpmodels/modules/scheduler.py | 41 ++++++++++++++++++++++++---- ocpmodels/trainers/base_trainer.py | 6 +++- ocpmodels/trainers/single_trainer.py | 4 ++- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index e1c203059a..e93fcf78a7 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -100,13 +100,36 @@ class EarlyStopper: """ def __init__( - self, patience=7, mode="min", min_abs_change=1e-5, store_all_steps=True + self, + patience=7, + mode="min", + min_abs_change=1e-5, + store_all_steps=True, + min_lr=-1, ): + """ + Whether train should stop or not. + + Args: + patience (int, optional): How many calls to `should_stop` with no + improvement before stopping training. Defaults to 7. + mode (str, optional): "min" or "max". Defaults to "min". + min_abs_change (float, optional): Minimum metric change to be considered an + improvement. Defaults to 1e-5. + store_all_steps (bool, optional): Whether to store all metrics passed to + `should_stop` or only the last `patience` ones. Defaults to True. + min_lr (bool, optional): Whether to stop when the current learning rate + reaches the . Defaults to -1. + + Raises: + ValueError: Unknown mode (neither min nor max) + """ self.patience = patience self.mode = mode self.counter = 0 self.min_abs_change = min_abs_change self.store_all_steps = store_all_steps + self.min_lr = min_lr self.metrics = [] if self.mode == "min": @@ -116,12 +139,17 @@ def __init__( else: raise ValueError("mode must be either min or max") - self.early_stop = False + self.early_stop = "" - def should_stop(self, metric): + def should_stop(self, metric, lr=None): """ - Returns True if the metric has not improved for a certain number of - steps. False otherwise. Stores the metric in `self.metrics`: all the steps if + Returns why the training should stop: + • Empty string if the training shouldn't stop + • "metric" if the metric has not improved for a certain number of + steps. + • "lr" if the learning rate has reached the minimum value. 
+ + Stores the metric in `self.metrics`: all the steps if `self.store_all_steps` is `True`, otherwise only the last `n=self.patience`. Args: @@ -151,6 +179,9 @@ def should_stop(self, metric): if self.counter >= self.patience: self.early_stop = True + if lr is not None and lr <= self.min_lr: + self.early_stop = True + return self.early_stop @property diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 244ddfd317..8500676f1a 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -83,7 +83,11 @@ def __init__(self, **kwargs): self.datasets = {} self.samplers = {} self.loaders = {} - self.early_stopper = EarlyStopper(patience=10, min_abs_change=1e-5) + self.early_stopper = EarlyStopper( + patience=10, + min_abs_change=1e-5, + min_lr=self.config["optim"].get("min_lr", -1), + ) if torch.cuda.is_available() and not self.cpu: self.device = torch.device(f"cuda:{self.config['local_rank']}") diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 2b43628a61..7fcdc9cd39 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -330,7 +330,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): checkpoint_file="best_checkpoint.pt", training_state=False, ) - if self.early_stopper.should_stop(current_val_metric): + if self.early_stopper.should_stop( + current_val_metric, self.scheduler.get_lr() + ): print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") if self.logger: self.logger.add_tags(["E-S"]) From 2c067ab77295149f6e8e06c83fbf9c3a64a2b460 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 18:05:57 -0500 Subject: [PATCH 150/273] update configs --- configs/exps/icml/qm9/fanet-manual.yaml | 43 ++++++++++++++++--------- configs/exps/qm7x/schnet.yaml | 37 ++++++++++++--------- configs/models/tasks/qm7x.yaml | 4 ++- 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index b45b7e4587..91fc451cbb 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -3,7 +3,7 @@ job: mem: 12GB cpus: 4 gres: gpu:16gb:1 - partition: main + partition: long default: wandb_project: ocp-qm @@ -12,15 +12,6 @@ default: test_ri: true wandb_tags: qm9, fanet-qm9-v1.0.2-continued log_train_every: 100 - optim: - warmup_steps: 2000 - # parameters EMA - ema_decay: 0.999 - decay_steps: max_steps - scheduler: LinearWarmupCosineAnnealingLR - batch_size: 64 - initial_lr: 0.001 - max_epochs: 1500 note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial @@ -30,19 +21,29 @@ default: targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels frame_averaging: 3D fa_frames: random + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + decay_steps: max_steps + scheduler: LinearWarmupCosineAnnealingLR + batch_size: 64 + initial_lr: 0.001 + max_epochs: 1500 model: + cutoff: 5.0 edge_embed_type: all_rij energy_head: weighted-av-initial-embeds graph_norm: True hidden_channels: 416 max_num_neighbors: 40 mp_type: updownscale - num_filters: 512 - num_gaussians: 100 - num_interactions: 3 + num_filters: 256 + num_gaussians: 50 + num_interactions: 5 otf_graph: false - pg_hidden_channels: 0 - phys_embeds: false + pg_hidden_channels: 32 + phys_embeds: true 
phys_hidden_channels: 0 second_layer_MLP: false skip_co: false @@ -54,4 +55,14 @@ default: runs: - {} - model: - mp_type: base_with_att \ No newline at end of file + mp_type: base_with_att + - model: + cutoff: 6.0 + - optim: + initial_lr: 0.0005 + - optim: + batch_size: 128 + - optim: + batch_size: 512 + - model: + energy_head: "" diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index 73e5ace742..9282c3ffe2 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -25,42 +25,47 @@ default: model: name, num_gaussians, hidden_channels, num_filters, num_interactions optim: batch_size, lr_initial optim: - batch_size: 2048 + batch_size: 512 warmup_steps: 1000 lr_initial: 0.0005 # parameters EMA ema_decay: 0.999 - decay_steps: 750000 - decay_rate: 0.05 - max_steps: 200000 + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + loss_energy: mse + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.00001 + verbose: true model: hidden_channels: 256 - num_filters: 256 + num_filters: 128 num_gaussians: 100 num_interactions: 6 cutoff: 5.0 + regress_forces: "from_energy" runs: - {} - optim: - batch_size: 2048 + lr_initial: 0.001 + batch_size: 1024 - optim: - batch_size: 4096 + batch_size: 256 - optim: lr_initial: 0.001 - optim: lr_initial: 0.001 batch_size: 2048 - model: - num_gaussians: 200 - - model: - hidden_channels: 1024 - - model: - num_filters: 1024 - - model: - num_interactions: 8 + hidden_channels: 512 - model: num_interactions: 6 num_gaussians: 20 - num_filters: 64 - hidden_channels: 1024 \ No newline at end of file + hidden_channels: 512 \ No newline at end of file diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 98de512a2d..0a6fa33094 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -18,9 +18,11 @@ default: optim: optimizer: AdamW - force_coefficient: 30 energy_coefficient: 1 energy_grad_coefficient: 10 + force_coefficient: 100 + loss_energy: mse + loss_force: mse normalizer: null graph_rewiring: "" From 3f572cd83176e444e5d106208c6919affa972aa2 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 18:11:21 -0500 Subject: [PATCH 151/273] handle float eval_every --- configs/exps/qm7x/schnet.yaml | 2 ++ ocpmodels/trainers/single_trainer.py | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index 9282c3ffe2..a62f41b469 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -28,6 +28,7 @@ default: batch_size: 512 warmup_steps: 1000 lr_initial: 0.0005 + eval_every: 0.34 # parameters EMA ema_decay: 0.999 energy_coefficient: 1 @@ -35,6 +36,7 @@ default: force_coefficient: 100 loss_energy: mse loss_force: mse + eval_every: # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 7fcdc9cd39..93306633ad 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -193,6 +193,8 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): n_train = len(self.loaders["train"]) epoch_int = 0 eval_every = self.config["optim"].get("eval_every", n_train) + if eval_every < 1: + eval_every = int(n_train * eval_every) if self.config["print_every"] < 0: self.config["print_every"] = n_train 
primary_metric = self.config["task"].get( @@ -211,9 +213,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): model_run_time = 0 if not self.silent: - print(f"--- 🔄 Beginning of Training @ {self.now}---\n") - print(f"Logging train metrics every {log_train_every} steps") + print(f"\n--- 🔄 Beginning of Training @ {self.now}---\n") + print(f"\nLogging train metrics every {log_train_every} steps") print(f"Printing train metrics every {self.config['print_every']} steps") + print(f"\nEvaluating every {eval_every} steps\n") for epoch_int in range(start_epoch, self.config["optim"]["max_epochs"]): From 6c8c48ef85c50c47fdf5d7aee6438b294efd41fc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 23:00:21 -0500 Subject: [PATCH 152/273] increase ES patience --- configs/exps/icml/qm9/fanet-manual.yaml | 12 +++++++++++- configs/exps/qm7x/schnet.yaml | 3 +-- ocpmodels/trainers/base_trainer.py | 2 +- ocpmodels/trainers/single_trainer.py | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index 91fc451cbb..6830f52f21 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -1,4 +1,4 @@ -# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +# scheduler reduce lr on plateau job: mem: 12GB cpus: 4 @@ -30,6 +30,16 @@ default: batch_size: 64 initial_lr: 0.001 max_epochs: 1500 + loss_energy: mse + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.00001 + verbose: true model: cutoff: 5.0 edge_embed_type: all_rij diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index a62f41b469..b5ecf97358 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -1,6 +1,6 @@ # trainset has 4068193 samples job: - mem: 48GB + mem: 32GB cpus: 8 gres: gpu:16gb:1 partition: long @@ -36,7 +36,6 @@ default: force_coefficient: 100 loss_energy: mse loss_force: mse - eval_every: # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 8500676f1a..e8917a389e 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -84,7 +84,7 @@ def __init__(self, **kwargs): self.samplers = {} self.loaders = {} self.early_stopper = EarlyStopper( - patience=10, + patience=15, min_abs_change=1e-5, min_lr=self.config["optim"].get("min_lr", -1), ) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 93306633ad..5a6ed887b1 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -192,7 +192,7 @@ def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False) def train(self, disable_eval_tqdm=True, debug_batches=-1): n_train = len(self.loaders["train"]) epoch_int = 0 - eval_every = self.config["optim"].get("eval_every", n_train) + eval_every = self.config["optim"].get("eval_every", n_train) or n_train if eval_every < 1: eval_every = int(n_train * eval_every) if self.config["print_every"] < 0: From 6e242cff339e938901cf63bfa49b5c3c0bfe2611 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 23:29:23 -0500 Subject: [PATCH 153/273] fix warmup scheduler state dict --- configs/exps/icml/qm9/fanet-manual.yaml | 8 ++--- ocpmodels/trainers/base_trainer.py | 43 
+++++++++++++------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index 6830f52f21..0fc5b23986 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -43,9 +43,9 @@ default: model: cutoff: 5.0 edge_embed_type: all_rij - energy_head: weighted-av-initial-embeds + energy_head: weighted-av-final-embeds graph_norm: True - hidden_channels: 416 + hidden_channels: 350 max_num_neighbors: 40 mp_type: updownscale num_filters: 256 @@ -68,11 +68,11 @@ runs: mp_type: base_with_att - model: cutoff: 6.0 - - optim: - initial_lr: 0.0005 - optim: batch_size: 128 - optim: batch_size: 512 - model: energy_head: "" + - model: + energy_head: "weighted-av-initial-embeds" diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e8917a389e..8830a29c01 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -511,27 +511,30 @@ def save( ): if not self.is_debug and dist_utils.is_master(): if training_state: - save_checkpoint( - { - "epoch": self.epoch, - "step": self.step, - "state_dict": self.model.state_dict(), - "optimizer": self.optimizer.state_dict(), - "scheduler": self.scheduler.scheduler.state_dict() - if self.scheduler.scheduler_type != "Null" - else None, - "warmup_scheduler": self.scheduler.warmup_scheduler.state_dict() - if hasattr(self.scheduler, "warmup_scheduler") - else None, - "normalizers": { - key: value.state_dict() - for key, value in self.normalizers.items() - }, - "config": self.config, - "val_metrics": metrics, - "ema": self.ema.state_dict() if self.ema else None, - "amp": self.scaler.state_dict() if self.scaler else None, + ckpt_dict = { + "epoch": self.epoch, + "step": self.step, + "state_dict": self.model.state_dict(), + "optimizer": self.optimizer.state_dict(), + "scheduler": self.scheduler.scheduler.state_dict() + if self.scheduler.scheduler_type != "Null" + else None, + "normalizers": { + key: value.state_dict() + for key, value in self.normalizers.items() }, + "config": self.config, + "val_metrics": metrics, + "ema": self.ema.state_dict() if self.ema else None, + "amp": self.scaler.state_dict() if self.scaler else None, + } + if self.scheduler.warmup_scheduler is not None: + ckpt_dict[ + "warmup_scheduler" + ] = self.scheduler.warmup_scheduler.state_dict() + + save_checkpoint( + ckpt_dict, checkpoint_dir=self.config["checkpoint_dir"], checkpoint_file=checkpoint_file, ) From 76cac140221b01aa4ebf68867c27830ef741fb86 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 23:30:25 -0500 Subject: [PATCH 154/273] fix load warmup_scheduler --- ocpmodels/trainers/base_trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 8830a29c01..e22a648d03 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -409,8 +409,9 @@ def load_checkpoint(self, checkpoint_path): self.optimizer.load_state_dict(checkpoint["optimizer"]) if "scheduler" in checkpoint and checkpoint["scheduler"] is not None: self.scheduler.scheduler.load_state_dict(checkpoint["scheduler"]) - if checkpoint.get("warmup_scheduler") is not None and hasattr( - self.scheduler, "warmup_scheduler" + if ( + checkpoint.get("warmup_scheduler") is not None + and self.scheduler.warmup_scheduler is not None ): self.scheduler.warmup_scheduler.load_state_dict( 
checkpoint["warmup_scheduler"] From a2f4d373d4583850f6e731f2aa07c595d035430f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 17 Jan 2023 05:36:50 -0500 Subject: [PATCH 155/273] orion 2-3, baseline and top config --- configs/exps/icml/is2re-all/baseline.yaml | 18 +++ .../exps/icml/is2re-all/fanet-orion-2.yaml | 10 +- .../exps/icml/is2re-all/fanet-orion-3.yaml | 58 ++++++++++ configs/exps/icml/is2re-all/top-config.yaml | 107 ++++++++++++++++++ configs/models/fanet.yaml | 1 + 5 files changed, 189 insertions(+), 5 deletions(-) create mode 100644 configs/exps/icml/is2re-all/baseline.yaml create mode 100644 configs/exps/icml/is2re-all/fanet-orion-3.yaml create mode 100644 configs/exps/icml/is2re-all/top-config.yaml diff --git a/configs/exps/icml/is2re-all/baseline.yaml b/configs/exps/icml/is2re-all/baseline.yaml new file mode 100644 index 0000000000..45102f9e29 --- /dev/null +++ b/configs/exps/icml/is2re-all/baseline.yaml @@ -0,0 +1,18 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:4 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + wandb_tags: 'baseline' + cp_data_to_tmpdir: true + +runs: + - config: dpp-is2re-all + note: 'Baseline 4 gpus' + - config: schnet-is2re-all + note: 'Baseline 4 gpus' diff --git a/configs/exps/icml/is2re-all/fanet-orion-2.yaml b/configs/exps/icml/is2re-all/fanet-orion-2.yaml index ae26af2e9e..cf88591af6 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-2.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-2.yaml @@ -3,7 +3,7 @@ job: mem: 32GB cpus: 4 gres: gpu:rtx8000:1 - time: 10:00:00 + time: 14:00:00 partition: long default: @@ -32,15 +32,15 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 216 + n_jobs: 72 - unique_exp_name: fanet-is2re-all-v1 + unique_exp_name: fanet-is2re-all-v2 space: optim/max_epochs: fidelity(15, 30, base=6) optim/lr_initial: loguniform(6e-4, 4e-3, precision=2) model/hidden_channels: uniform(8, 19, discrete=True) - model/mp_type: choices(["base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "updown_local_env"]) + model/mp_type: choices(["base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "simple", "updown_local_env"]) model/num_filters: uniform(3, 18, discrete=True) model/num_gaussians: uniform(50, 170, discrete=True) model/num_interactions: uniform(3, 7, discrete=True) @@ -48,7 +48,7 @@ orion: model/phys_embeds: choices([True, False]) model/tag_hidden_channels: uniform(0, 3, discrete=True) model/complex_mp: choices([True, False]) - model/att_heads: choices([1,3,6]) + model/att_heads: choices([1,3,5]) model/second_layer_MLP: choices([True, False]) model/skip_co: choices(["add", "concat", False]) model/cutoff: choices([4.0, 6.0, 10.0]) diff --git a/configs/exps/icml/is2re-all/fanet-orion-3.yaml b/configs/exps/icml/is2re-all/fanet-orion-3.yaml new file mode 100644 index 0000000000..1337d47b1e --- /dev/null +++ b/configs/exps/icml/is2re-all/fanet-orion-3.yaml @@ -0,0 +1,58 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + time: 14:00:00 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-is2re-all + mode: train + test_ri: true + wandb_tags: is2re-all, orion-3 + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + graph_norm: True + frame_averaging: 2D + fa_frames: random + optim: + scheduler: LinearWarmupCosineAnnealingLR + note: + model: name, num_gaussians, 
hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 72 + + unique_exp_name: fanet-is2re-all-v2 + + space: + optim/max_epochs: fidelity(7, 15, base=6) + optim/lr_initial: loguniform(9e-4, 5e-3, precision=2) + model/hidden_channels: uniform(8, 16, discrete=True) + model/energy_head: choices(["weighted-av-final-embeds", False]) + model/mp_type: choices(["base", "updownscale", "updownscale_base", "updown_local_env"]) + model/num_filters: uniform(4, 18, discrete=True) + model/num_gaussians: uniform(30, 120, discrete=True) + model/num_interactions: uniform(4, 8, discrete=True) + model/pg_hidden_channels: uniform(1, 2, discrete=True) + model/phys_embeds: choices([True, False]) + model/tag_hidden_channels: uniform(1, 2, discrete=True) + model/complex_mp: choices([True, False]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["concat", False]) + model/cutoff: choices([4.0, 6.0, 8.0]) + model/edge_embed_type: choices([all_rij, all]) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 diff --git a/configs/exps/icml/is2re-all/top-config.yaml b/configs/exps/icml/is2re-all/top-config.yaml new file mode 100644 index 0000000000..0debe73c19 --- /dev/null +++ b/configs/exps/icml/is2re-all/top-config.yaml @@ -0,0 +1,107 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 15:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + wandb_tags: 'best-config' + optim: + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'top-1-FA' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 6 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + - config: fanet-is2re-all + note: 'top-1-FA' + frame_averaging: 2D + fa_frames: all + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 6 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + - config: fanet-is2re-all + note: 'top-1-FA' + frame_averaging: DA + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 6 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + - config: fanet-is2re-all + note: 'top-1-FA' + 
frame_averaging: 3D + fa_frames: random + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 6 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index fe94635ca1..a8eb727333 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -79,6 +79,7 @@ is2re: batch_size: 256 eval_batch_size: 256 lr_initial: 0.001 + lr_gamma: 0.1 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - 18000 - 27000 From b4c56af8529d974de2322e06619a826901fcc6c8 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 17 Jan 2023 06:11:03 -0500 Subject: [PATCH 156/273] skip co with atom concat --- configs/exps/icml/is2re-all/fanet-orion-3.yaml | 1 + ocpmodels/models/fanet.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/is2re-all/fanet-orion-3.yaml b/configs/exps/icml/is2re-all/fanet-orion-3.yaml index 1337d47b1e..87f498043f 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-3.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-3.yaml @@ -20,6 +20,7 @@ default: fa_frames: random optim: scheduler: LinearWarmupCosineAnnealingLR + eval_every: 0.5 note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co optim: lr_initial, warmup_steps diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 1d2a2a3bc2..28255ddf46 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -496,7 +496,7 @@ class FANet(BaseModel): (default: :obj:`50`) second_layer_MLP (bool): use 2-layers MLP at the end of the Embedding block. skip_co (str): add a skip connection between each interaction block and - energy-head. + energy-head. ("add", False, "concat", "concat_atom") edge_embed_type (str, in {'rij','all_rij','sh', 'all'}): input feature of the edge embedding block. edge_embed_hidden (int): size of edge representation. 
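The `skip_co` options listed in this docstring differ in what gets carried across interaction blocks: "concat" accumulates one pooled energy per block and mixes the scalars, while the new "concat_atom" accumulates the atom embeddings themselves and projects them back to the hidden size before a single pass through the output block. A rough, self-contained sketch of the two aggregations (toy layer names and sizes, not the actual FANet modules):

    import torch
    from torch import nn

    # Illustrative sizes only (assumptions, not taken from the configs above).
    hidden, n_blocks, n_atoms = 64, 4, 10
    h0 = torch.randn(n_atoms, hidden)                    # atom embeddings
    blocks = [nn.Linear(hidden, hidden) for _ in range(n_blocks)]
    out_head = nn.Linear(hidden, 1)                      # stand-in for the output block

    # skip_co == "concat": keep one pooled energy per interaction block,
    # then mix the (n_blocks + 1) energies with a small linear layer.
    h, energies = h0, [out_head(h0).sum(dim=0)]
    for block in blocks:
        h = h + block(h)
        energies.append(out_head(h).sum(dim=0))
    energy_concat = nn.Linear(n_blocks + 1, 1)(torch.cat(energies, dim=-1))

    # skip_co == "concat_atom": keep the atom embeddings themselves, concatenate
    # them feature-wise and project back to `hidden` before one output head call.
    h, states = h0, [h0]
    for block in blocks:
        h = h + block(h)
        states.append(h)
    h = torch.relu(nn.Linear((n_blocks + 1) * hidden, hidden)(torch.cat(states, dim=-1)))
    energy_concat_atom = out_head(h).sum(dim=0)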
@@ -590,6 +590,11 @@ def __init__(self, **kwargs): # Skip co if self.skip_co == "concat": self.mlp_skip_co = Linear((kwargs["num_interactions"] + 1), 1) + elif self.skip_co == "concat_atom": + self.mlp_skip_co = Linear( + ((kwargs["num_interactions"] + 1) * kwargs["hidden_channels"]), + kwargs["hidden_channels"], + ) @conditional_grad(torch.enable_grad()) def forces_forward(self, preds): @@ -651,13 +656,19 @@ def energy_forward(self, data): # Interaction blocks energy_skip_co = [] for interaction in self.interaction_blocks: - if self.skip_co: + if self.skip_co == "concat_atom": + energy_skip_co.append(h) + elif self.skip_co: energy_skip_co.append( self.output_block(h, edge_index, edge_weight, batch, alpha) ) h = h + interaction(h, edge_index, e) - # Output block + # Atom skip-co + if self.skip_co == "concat_atom": + energy_skip_co.append(h) + h = self.act(self.mlp_skip_co(torch.cat(energy_skip_co, dim=1))) + energy = self.output_block(h, edge_index, edge_weight, batch, alpha) # Skip-connection From 48d8e670fcdbfab596d0ae4e8ee49b88ba63b24f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 17 Jan 2023 06:13:56 -0500 Subject: [PATCH 157/273] update orion-3 --- configs/exps/icml/is2re-all/fanet-orion-3.yaml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/configs/exps/icml/is2re-all/fanet-orion-3.yaml b/configs/exps/icml/is2re-all/fanet-orion-3.yaml index 87f498043f..8daecd138c 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-3.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-3.yaml @@ -17,7 +17,6 @@ default: model: graph_norm: True frame_averaging: 2D - fa_frames: random optim: scheduler: LinearWarmupCosineAnnealingLR eval_every: 0.5 @@ -36,22 +35,24 @@ orion: unique_exp_name: fanet-is2re-all-v2 space: - optim/max_epochs: fidelity(7, 15, base=6) - optim/lr_initial: loguniform(9e-4, 5e-3, precision=2) - model/hidden_channels: uniform(8, 16, discrete=True) + model/complex_mp: choices([True, False]) + model/cutoff: choices([4.0, 6.0, 8.0]) + model/edge_embed_type: choices(["all_rij", "all"]) model/energy_head: choices(["weighted-av-final-embeds", False]) + model/fa_frames: choices(["random", "se3-random"]) + model/hidden_channels: uniform(8, 16, discrete=True) + model/max_num_neighbors: choices([30, 40, 50]) model/mp_type: choices(["base", "updownscale", "updownscale_base", "updown_local_env"]) model/num_filters: uniform(4, 18, discrete=True) model/num_gaussians: uniform(30, 120, discrete=True) model/num_interactions: uniform(4, 8, discrete=True) model/pg_hidden_channels: uniform(1, 2, discrete=True) model/phys_embeds: choices([True, False]) - model/tag_hidden_channels: uniform(1, 2, discrete=True) - model/complex_mp: choices([True, False]) model/second_layer_MLP: choices([True, False]) model/skip_co: choices(["concat", False]) - model/cutoff: choices([4.0, 6.0, 8.0]) - model/edge_embed_type: choices([all_rij, all]) + model/tag_hidden_channels: uniform(1, 2, discrete=True) + optim/lr_initial: loguniform(9e-4, 5e-3, precision=2) + optim/max_epochs: fidelity(7, 15, base=6) algorithms: asha: seed: 123 From 974fb12e608a0e1568d8688f832145724f734a11 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 10:36:41 -0500 Subject: [PATCH 158/273] more explicit reason --- .../qm7x/{schnet.yaml => schnet-fanet.yaml} | 15 +++++++++++++-- configs/models/tasks/qm7x.yaml | 1 - ocpmodels/modules/scheduler.py | 17 +++++++++++------ 3 files changed, 24 insertions(+), 9 deletions(-) rename configs/exps/qm7x/{schnet.yaml => schnet-fanet.yaml} (79%) diff 
--git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet-fanet.yaml similarity index 79% rename from configs/exps/qm7x/schnet.yaml rename to configs/exps/qm7x/schnet-fanet.yaml index b5ecf97358..8adab7d58a 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -42,7 +42,7 @@ default: factor: 0.5 threshold: 0.001 threshold_mode: abs - min_lr: 0.00001 + min_lr: 0.000001 verbose: true model: hidden_channels: 256 @@ -69,4 +69,15 @@ runs: - model: num_interactions: 6 num_gaussians: 20 - hidden_channels: 512 \ No newline at end of file + hidden_channels: 512 + - config: fanet-qm7x-all + model: + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + - config: fanet-qm7x-all + model: + force_decoder_type: mlp + edge_embed_type: all_rij + mp_type: updownscale_base + regress_forces: direct_with_gradient_target diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 0a6fa33094..81d9ace719 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -6,7 +6,6 @@ default: otf_graph: False max_num_neighbors: 40 use_pbc: False - force_decoder_type: null task: dataset: qm7x diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index e93fcf78a7..efab892f7e 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -177,16 +177,21 @@ def should_stop(self, metric, lr=None): self.counter += 1 if self.counter >= self.patience: - self.early_stop = True + self.early_stop = "metric" if lr is not None and lr <= self.min_lr: - self.early_stop = True + self.early_stop = "lr" return self.early_stop @property def reason(self): - return ( - f"Early stopping after {self.counter} steps with no improvement:\n" - + " -> ".join([f"{m:.6f}" for m in self.metrics[-self.patience :]]) - ) + if self.early_stop == "metric": + return ( + f"Early stopping after {self.counter} steps with no improvement:\n" + + " -> ".join([f"{m:.6f}" for m in self.metrics[-self.patience :]]) + ) + elif self.early_stop == "lr": + return f"Early stopping because learning rate reached {self.min_lr}" + + return "" From f00c655cc1666b673b8bef86a59bc686f46be088 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:42:48 -0500 Subject: [PATCH 159/273] ES warmup epochs --- ocpmodels/modules/scheduler.py | 7 ++++++- ocpmodels/trainers/base_trainer.py | 15 ++++++++------- ocpmodels/trainers/single_trainer.py | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index efab892f7e..7c0a5fc071 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -106,6 +106,7 @@ def __init__( min_abs_change=1e-5, store_all_steps=True, min_lr=-1, + warmup_epochs=-1, ): """ Whether train should stop or not. 
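The `warmup_epochs` argument added here keeps resetting the patience counter until a minimum number of epochs has passed, so early stopping can only trigger after the warmup window; together with the change above, `should_stop` now reports a reason instead of a bare boolean. A minimal standalone sketch of that behaviour (not the repo's EarlyStopper class, just the core logic):

    def should_stop(history, patience=15, warmup_epochs=-1, min_abs_change=1e-5):
        """history: list of (epoch, val_metric) pairs, lower is better."""
        best, counter = float("inf"), 0
        for epoch, metric in history:
            if metric < best - min_abs_change:
                best, counter = metric, 0       # improvement: reset patience
            else:
                counter += 1
            if 0 < warmup_epochs and epoch < warmup_epochs:
                counter = 0                     # no patience accumulated during warmup
            if counter >= patience:
                return f"no improvement for {patience} evaluations (epoch {epoch})"
        return ""

    # A flat validation curve only triggers once the warmup window has passed:
    flat = [(e, 1.0) for e in range(30)]
    assert should_stop(flat, patience=5, warmup_epochs=50) == ""
    assert should_stop(flat, patience=5, warmup_epochs=10) != ""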
@@ -130,6 +131,7 @@ def __init__( self.min_abs_change = min_abs_change self.store_all_steps = store_all_steps self.min_lr = min_lr + self.warmup_epochs = warmup_epochs self.metrics = [] if self.mode == "min": @@ -141,7 +143,7 @@ def __init__( self.early_stop = "" - def should_stop(self, metric, lr=None): + def should_stop(self, metric, lr=None, epoch=None): """ Returns why the training should stop: • Empty string if the training shouldn't stop @@ -176,6 +178,9 @@ def should_stop(self, metric, lr=None): else: self.counter += 1 + if self.warmup_epochs > 0 and epoch is not None and epoch < self.warmup_epochs: + self.counter = 0 + if self.counter >= self.patience: self.early_stop = "metric" diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e22a648d03..a75db63bbe 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -36,7 +36,7 @@ from ocpmodels.common.graph_transforms import RandomReflect, RandomRotate from ocpmodels.common.registry import registry from ocpmodels.common.timer import Times -from ocpmodels.common.utils import JOB_ID, get_commit_hash, save_checkpoint +from ocpmodels.common.utils import JOB_ID, get_commit_hash, save_checkpoint, resolve from ocpmodels.datasets.data_transforms import FrameAveraging, get_transforms from ocpmodels.modules.evaluator import Evaluator from ocpmodels.modules.exponential_moving_average import ( @@ -52,7 +52,7 @@ class BaseTrainer(ABC): def __init__(self, **kwargs): run_dir = kwargs["run_dir"] - model_name = kwargs["model"].pop("name") + model_name = kwargs["model"].pop("name", kwargs["model_name"]) kwargs["model"]["graph_rewiring"] = kwargs.get("graph_rewiring") self.config = { @@ -60,9 +60,9 @@ def __init__(self, **kwargs): "model_name": model_name, "gpus": dist_utils.get_world_size() if not kwargs["cpu"] else 0, "commit": get_commit_hash(), - "checkpoint_dir": str(Path(run_dir) / "checkpoints"), - "results_dir": str(Path(run_dir) / "results"), - "logs_dir": str(Path(run_dir) / "logs"), + "checkpoint_dir": str(resolve(run_dir) / "checkpoints"), + "results_dir": str(resolve(run_dir) / "results"), + "logs_dir": str(resolve(run_dir) / "logs"), } self.sigterm = False @@ -84,9 +84,10 @@ def __init__(self, **kwargs): self.samplers = {} self.loaders = {} self.early_stopper = EarlyStopper( - patience=15, - min_abs_change=1e-5, + patience=self.config["optim"].get("es_patience") or 15, + min_abs_change=self.config["optim"].get("es_min_abs_change") or 1e-5, min_lr=self.config["optim"].get("min_lr", -1), + warmup_epochs=self.config["optim"].get("es_warmup_epochs") or -1, ) if torch.cuda.is_available() and not self.cpu: diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 5a6ed887b1..ef7fc5fff6 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -334,7 +334,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): training_state=False, ) if self.early_stopper.should_stop( - current_val_metric, self.scheduler.get_lr() + current_val_metric, self.scheduler.get_lr(), self.epoch ): print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") if self.logger: From 12c189967987b76e59fd1fa644c00f7a3d527ba7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:43:21 -0500 Subject: [PATCH 160/273] enable `--continue_from_dir` arg --- main.py | 6 -- ocpmodels/common/flags.py | 5 +- ocpmodels/common/logger.py | 2 +- ocpmodels/common/utils.py | 135 +++++++++++++++++++++++++------------ 4 files changed, 
96 insertions(+), 52 deletions(-) diff --git a/main.py b/main.py index b3f02d599b..9a756f817a 100644 --- a/main.py +++ b/main.py @@ -166,12 +166,6 @@ def run(self, orion_exp=None): parser = flags.get_parser() args, override_args = parser.parse_known_args() args = update_from_sbatch_py_vars(args) - if not args.config: - args.config = "sfarinet-is2re-10k" - # args.checkpoint = "checkpoints/2022-04-26-12-23-28-schnet/checkpoint.pt" - warnings.warn( - f"\n>>>> No config is provided. Defaulting to {args.config} chosen\n" - ) if args.logdir: args.logdir = resolve(args.logdir) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 77cb140cc3..f7da16d626 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -86,6 +86,9 @@ def add_core_args(self): self.parser.add_argument( "--checkpoint", type=str, help="Model checkpoint to load" ) + self.parser.add_argument( + "--continue_from_dir", type=str, help="Run to continue, loading its config" + ) self.parser.add_argument( "--timestamp-id", default=None, @@ -109,7 +112,7 @@ def add_core_args(self): ) self.parser.add_argument( "--logdir", - default="$SCRATCH/ocp/runs/$SLURM_JOB_ID", + default=Path("$SCRATCH/ocp/runs/$SLURM_JOB_ID"), type=Path, help="Where to store logs", ) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 380401332c..b628704fcf 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -133,7 +133,7 @@ def __init__(self, trainer_config): self.url = wandb.run.get_url() if self.url: - with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: + with open(Path(self.trainer_config["run_dir"]) / "wandb_url.txt", "w") as f: f.write(self.url + "\n") if not CLUSTER.drac: self.collect_output_files(policy="live") diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index edade03673..68029bf367 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -6,6 +6,7 @@ """ import ast +import argparse import collections import copy import glob @@ -973,6 +974,50 @@ def load_config_legacy(path: str, previous_includes: list = []): return config, duplicates_warning, duplicates_error +def set_cpus_to_workers(config): + if not config.get("no_cpus_to_workers"): + cpus = count_cpus() + gpus = count_gpus() + if cpus is not None: + if gpus == 0: + workers = cpus - 1 + else: + workers = cpus // gpus + if not config["silent"]: + print( + f"Overriding num_workers from {config['optim']['num_workers']}", + f"to {workers} to match the machine's CPUs.", + "Use --no_cpus_to_workers=true to disable this behavior.", + ) + config["optim"]["num_workers"] = workers + return config + + +def check_regress_forces(config): + if "regress_forces" in config["model"]: + if config["model"]["regress_forces"] == "": + config["model"]["regress_forces"] = False + if not isinstance(config["model"]["regress_forces"], str): + if config["model"]["regress_forces"] is False: + config["model"]["regress_forces"] = "" + else: + raise ValueError( + "regress_forces must be False or a string: " + + "'from_energy' or 'direct' or 'direct_with_gradient_target'" + + f". Received: `{str(config['model']['regress_forces'])}`" + ) + elif config["model"]["regress_forces"] not in { + "from_energy", + "direct", + "direct_with_gradient_target", + }: + raise ValueError( + "regress_forces must be False or a string: " + + "'from_energy' or 'direct' or 'direct_with_gradient_target'" + + f". 
Received: `{str(config['model']['regress_forces'])}`" + ) + + def load_config(config_str): model, task, split = config_str.split("-") conf_path = ROOT / "configs" / "models" @@ -1002,74 +1047,76 @@ def load_config(config_str): def build_config(args, args_override): + config = overrides = continue_config = {} if args.config_yml: raise ValueError( "Using LEGACY config format. Please update your config to the new format." ) - config = load_config(args.config) - - # Check for overridden parameters. + args_dict_with_defaults = {k: v for k, v in vars(args).items() if v is not None} if args_override != []: overrides = create_dict_from_args(args_override) - config = merge_dicts(config, overrides) - config = merge_dicts(config, {k: v for k, v in vars(args).items() if v is not None}) + if args.continue_from_dir: + cont_dir = Path(args.continue_from_dir) + best_ckpt = cont_dir / "checkpoints/best_checkpoint.pt" + if not best_ckpt.exists(): + print( + f"💥 Could not find best checkpoint at {str(best_ckpt)}. " + + "Please make sure the directory is correct." + ) + else: + continue_config = torch.load(str(best_ckpt), map_location="cpu")["config"] + continue_config["checkpoint"] = str( + sorted( + cont_dir.glob("checkpoints/checkpoint-*.pt"), + key=lambda c: float(c.stem.split("-")[-1]), + )[-1] + ) + print( + "✅ Loading config from continuing dir and latest checkpoint:", + continue_config["checkpoint"], + ) + args.config = continue_config["config"] + + config = load_config(args.config) + config = merge_dicts(config, args_dict_with_defaults) + config = merge_dicts(config, overrides) config["data_split"] = args.config.split("-")[-1] config["run_dir"] = resolve(config["run_dir"]) config["slurm"] = {} config["job_id"] = JOB_ID or "no-job-id" config["job_ids"] = JOB_ID or "no-job-id" config["cluster_name"] = CLUSTER.name + config["world_size"] = args.num_nodes * args.num_gpus - if "regress_forces" in config["model"]: - if config["model"]["regress_forces"] == "": - config["model"]["regress_forces"] = False - if not isinstance(config["model"]["regress_forces"], str): - if config["model"]["regress_forces"] is False: - config["model"]["regress_forces"] = "" - else: - raise ValueError( - "regress_forces must be False or a string: " - + "'from_energy' or 'direct' or 'direct_with_gradient_target'" - + f". Received: `{str(config['model']['regress_forces'])}`" - ) - elif config["model"]["regress_forces"] not in { - "from_energy", - "direct", - "direct_with_gradient_target", - }: - raise ValueError( - "regress_forces must be False or a string: " - + "'from_energy' or 'direct' or 'direct_with_gradient_target'" - + f". 
Received: `{str(config['model']['regress_forces'])}`" - ) + if continue_config: + dirs_k_v = [(k, v) for k, v in config.items() if "dir" in k] + dataset_config = copy.deepcopy(config["dataset"]) + config = merge_dicts( + continue_config, + {k: resolve(v) if isinstance(v, str) else v for k, v in dirs_k_v}, + ) + config["dataset"] = dataset_config + config = merge_dicts(config, cli_args_dict()) + config = merge_dicts(config, overrides) + check_regress_forces(config) + config = set_cpus_to_workers(config) config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) config = override_drac_paths(config) - if not config["no_cpus_to_workers"]: - cpus = count_cpus() - gpus = count_gpus() - if cpus is not None: - if gpus == 0: - workers = cpus - 1 - else: - workers = cpus // gpus - if not config["silent"]: - print( - f"Overriding num_workers from {config['optim']['num_workers']}", - f"to {workers} to match the machine's CPUs.", - "Use --no_cpus_to_workers=true to disable this behavior.", - ) - config["optim"]["num_workers"] = workers - config["world_size"] = args.num_nodes * args.num_gpus - return config +def cli_args_dict(): + dummy = argparse.ArgumentParser() + _, cli_args = dummy.parse_known_args() + return create_dict_from_args(cli_args) + + def create_grid(base_config, sweep_file): def _flatten_sweeps(sweeps, root_key="", sep="."): flat_sweeps = [] From 4d6f41f1bd7b88a9e5930485123f452d504961d8 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:44:08 -0500 Subject: [PATCH 161/273] use resolve --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 68029bf367..62a9104692 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1059,7 +1059,7 @@ def build_config(args, args_override): overrides = create_dict_from_args(args_override) if args.continue_from_dir: - cont_dir = Path(args.continue_from_dir) + cont_dir = resolve(args.continue_from_dir) best_ckpt = cont_dir / "checkpoints/best_checkpoint.pt" if not best_ckpt.exists(): print( From ec1fa92b5cc89e09d35616c844288bb92c051eae Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:54:38 -0500 Subject: [PATCH 162/273] improve --- ocpmodels/common/utils.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 62a9104692..8aafd6c51e 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1060,24 +1060,19 @@ def build_config(args, args_override): if args.continue_from_dir: cont_dir = resolve(args.continue_from_dir) - best_ckpt = cont_dir / "checkpoints/best_checkpoint.pt" - if not best_ckpt.exists(): + ckpts = list(cont_dir.glob("checkpoints/checkpoint-*.pt")) + if not ckpts: print( - f"💥 Could not find best checkpoint at {str(best_ckpt)}. " + f"💥 Could not find checkpoints in {str(cont_dir)}. " + "Please make sure the directory is correct." 
) else: - continue_config = torch.load(str(best_ckpt), map_location="cpu")["config"] - continue_config["checkpoint"] = str( - sorted( - cont_dir.glob("checkpoints/checkpoint-*.pt"), - key=lambda c: float(c.stem.split("-")[-1]), - )[-1] - ) - print( - "✅ Loading config from continuing dir and latest checkpoint:", - continue_config["checkpoint"], + latest_ckpt = str( + sorted(ckpts, key=lambda c: float(c.stem.split("-")[-1]))[-1] ) + continue_config["checkpoint"] = str(latest_ckpt) + continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] + print("✅ Loading config from cont dir and latest checkpoint:", latest_ckpt) args.config = continue_config["config"] config = load_config(args.config) From 5777bd257be64a467c2f9b4ad3dbd27128523423 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:56:15 -0500 Subject: [PATCH 163/273] update --- configs/exps/icml/qm9/fanet-manual.yaml | 12 +++--- configs/exps/qm7x/schnet-fanet.yaml | 57 +++++++++++++++++++------ 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index 0fc5b23986..9a26c42d33 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -25,18 +25,20 @@ default: warmup_steps: 3000 # parameters EMA ema_decay: 0.999 - decay_steps: max_steps - scheduler: LinearWarmupCosineAnnealingLR batch_size: 64 - initial_lr: 0.001 + initial_lr: 0.0005 max_epochs: 1500 loss_energy: mse loss_force: mse + # early stopping + es_patience: 20 + es_es_min_abs_change: 0.000001 + es_warmup_epochs: 500 # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min factor: 0.5 - threshold: 0.001 + threshold: 0.0001 threshold_mode: abs min_lr: 0.00001 verbose: true @@ -70,8 +72,6 @@ runs: cutoff: 6.0 - optim: batch_size: 128 - - optim: - batch_size: 512 - model: energy_head: "" - model: diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index 8adab7d58a..165a492eff 100644 --- a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -1,7 +1,7 @@ # trainset has 4068193 samples job: mem: 32GB - cpus: 8 + cpus: 4 gres: gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 @@ -25,15 +25,16 @@ default: model: name, num_gaussians, hidden_channels, num_filters, num_interactions optim: batch_size, lr_initial optim: - batch_size: 512 - warmup_steps: 1000 + batch_size: 32 + max_epochs: 100 + warmup_steps: 3000 lr_initial: 0.0005 eval_every: 0.34 # parameters EMA ema_decay: 0.999 - energy_coefficient: 1 - energy_grad_coefficient: 10 - force_coefficient: 100 + energy_coefficient: 0.001 + energy_grad_coefficient: 0 + force_coefficient: 0.999 loss_energy: mse loss_force: mse # all below is for the scheduler @@ -58,26 +59,56 @@ runs: lr_initial: 0.001 batch_size: 1024 - optim: - batch_size: 256 - - optim: - lr_initial: 0.001 - - optim: - lr_initial: 0.001 - batch_size: 2048 + batch_size: 128 - model: hidden_channels: 512 - model: num_interactions: 6 - num_gaussians: 20 + num_gaussians: 50 hidden_channels: 512 + - config: fanet-qm7x-all + optim: + initial_lr: 0.0005 + energy_coefficient: 0.001 + energy_grad_coefficient: 0.01 + force_coefficient: 0.989 model: + graph_norm: true force_decoder_type: mlp edge_embed_type: all_rij regress_forces: direct_with_gradient_target - config: fanet-qm7x-all + optim: + initial_lr: 0.0001 + energy_coefficient: 0.001 + energy_grad_coefficient: 0.01 + force_coefficient: 0.989 model: 
+ graph_norm: true force_decoder_type: mlp edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + - config: fanet-qm7x-all + optim: + initial_lr: 0.0001 + energy_coefficient: 0.001 + energy_grad_coefficient: 0.01 + force_coefficient: 0.989 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + - config: fanet-qm7x-all + optim: + initial_lr: 0.0001 + energy_coefficient: 0.001 + energy_grad_coefficient: 0.01 + force_coefficient: 0.989 + model: + graph_norm: true + force_decoder_type: mlp mp_type: updownscale_base + edge_embed_type: all_rij regress_forces: direct_with_gradient_target From 9dfcce642c71f5caf557784421b63d74f6e26583 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 16:08:44 -0500 Subject: [PATCH 164/273] update fanet orion qm9 --- configs/exps/icml/qm9/fanet-manual.yaml | 2 +- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 27 +++++++++++++++------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index 9a26c42d33..bf63636e08 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -32,7 +32,7 @@ default: loss_force: mse # early stopping es_patience: 20 - es_es_min_abs_change: 0.000001 + es_min_abs_change: 0.000001 es_warmup_epochs: 500 # all below is for the scheduler scheduler: ReduceLROnPlateau diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 97aa0a69bf..88e871f0a4 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -14,19 +14,29 @@ default: wandb_tags: qm9, orion log_train_every: 100 optim: - warmup_steps: 2000 + warmup_steps: 3000 # parameters EMA ema_decay: 0.999 - decay_steps: max_steps - scheduler: LinearWarmupCosineAnnealingLR - batch_size: 64 + loss_energy: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 500 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.00001 + verbose: true note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial _root_: frame_averaging, fa_frames orion_mult_factor: value: 32 - targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, batch_size frame_averaging: 3D fa_frames: random model: @@ -36,11 +46,12 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v2.0.0 + unique_exp_name: fanet-qm9-v3.0.0 space: - optim/max_epochs: fidelity(100, 2000, base=5) - optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) + optim/max_epochs: fidelity(150, 2000, base=5) + optim/batch_size: uniform(1, 4, discrete=True) + optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) model/graph_norm: choices([True, False]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) model/hidden_channels: uniform(5, 16, discrete=True) From b7896c5f515e155d97be6fd98c1b16906d21526b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 16:40:52 -0500 Subject: [PATCH 165/273] fix model name --- 
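A note on the `orion_mult_factor` block used in the search spaces above: Orion samples small integers (e.g. `model/hidden_channels: uniform(8, 16, discrete=True)`) and the listed `targets` are then multiplied by `value`, so the runs actually explore hidden sizes in the 256-512 range in steps of 32 (11 x 32 = 352 matches the top configs earlier in this series). The post-processing is roughly as below; this approximates the repo's `apply_mult_factor` and the exact implementation may differ:

    def apply_mult_factor(hparams, mult_factor, sep="."):
        """Multiply selected Orion-sampled integers by a common factor (sketch).

        hparams     : flat dict such as {"model.hidden_channels": 11, ...}
        mult_factor : {"value": 32, "targets": "hidden_channels, num_filters"}
        """
        targets = {t.strip() for t in mult_factor["targets"].split(",")}
        out = {}
        for key, val in hparams.items():
            leaf = key.split(sep)[-1]
            out[key] = val * mult_factor["value"] if leaf in targets else val
        return out

    sampled = {"model.hidden_channels": 11, "model.num_gaussians": 90, "optim.lr_initial": 2e-3}
    print(apply_mult_factor(sampled, {"value": 32, "targets": "hidden_channels, num_filters"}))
    # {'model.hidden_channels': 352, 'model.num_gaussians': 90, 'optim.lr_initial': 0.002}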
configs/exps/qm7x/schnet-fanet.yaml | 2 +- ocpmodels/trainers/base_trainer.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index 165a492eff..dfa52ea54f 100644 --- a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -35,7 +35,7 @@ default: energy_coefficient: 0.001 energy_grad_coefficient: 0 force_coefficient: 0.999 - loss_energy: mse + loss_energy: mae loss_force: mse # all below is for the scheduler scheduler: ReduceLROnPlateau diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index a75db63bbe..71ebca2585 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -52,7 +52,9 @@ class BaseTrainer(ABC): def __init__(self, **kwargs): run_dir = kwargs["run_dir"] - model_name = kwargs["model"].pop("name", kwargs["model_name"]) + model_name = kwargs["model"].pop( + "name", kwargs.get("model_name", "Unknown - base_trainer issue") + ) kwargs["model"]["graph_rewiring"] = kwargs.get("graph_rewiring") self.config = { From 1d3ca4831d6be405f6960fc494492380eeb01bf5 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 16:24:22 -0500 Subject: [PATCH 166/273] improve exp manager --- configs/exps/icml/qm9/fanet-manual.yaml | 29 +++-- configs/exps/qm7x/schnet-fanet.yaml | 153 ++++++++++++++---------- main.py | 4 - ocpmodels/common/exp_manager.py | 137 ++++++++++++++++++++- 4 files changed, 243 insertions(+), 80 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index bf63636e08..79a65f829b 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -13,12 +13,9 @@ default: wandb_tags: qm9, fanet-qm9-v1.0.2-continued log_train_every: 100 note: - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm - optim: batch_size, lr_initial + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size _root_: frame_averaging, fa_frames - orion_mult_factor: - value: 32 - targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels frame_averaging: 3D fa_frames: random optim: @@ -31,17 +28,18 @@ default: loss_energy: mse loss_force: mse # early stopping - es_patience: 20 + es_patience: 50 es_min_abs_change: 0.000001 es_warmup_epochs: 500 - # all below is for the scheduler + # all below is for the ReduceLROnPlateau scheduler scheduler: ReduceLROnPlateau mode: min - factor: 0.5 + factor: 0.75 threshold: 0.0001 threshold_mode: abs - min_lr: 0.00001 + min_lr: 0.000001 verbose: true + patience: 10 model: cutoff: 5.0 edge_embed_type: all_rij @@ -54,7 +52,7 @@ default: num_gaussians: 50 num_interactions: 5 otf_graph: false - pg_hidden_channels: 32 + pg_hidden_channels: 16 phys_embeds: true phys_hidden_channels: 0 second_layer_MLP: false @@ -76,3 +74,14 @@ runs: energy_head: "" - model: energy_head: "weighted-av-initial-embeds" + - model: + num_interactions: 4 + num_gaussians: 20 + hidden_channels: 512 + num_filters: 512 + - model: + mp_type: updownscale_base + num_interactions: 4 + num_gaussians: 20 + hidden_channels: 512 + num_filters: 512 diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index dfa52ea54f..ece614e5f3 100644 --- 
a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -25,16 +25,16 @@ default: model: name, num_gaussians, hidden_channels, num_filters, num_interactions optim: batch_size, lr_initial optim: - batch_size: 32 + batch_size: 10 max_epochs: 100 warmup_steps: 3000 - lr_initial: 0.0005 + lr_initial: 0.0001 eval_every: 0.34 # parameters EMA ema_decay: 0.999 - energy_coefficient: 0.001 + energy_coefficient: 0.01 energy_grad_coefficient: 0 - force_coefficient: 0.999 + force_coefficient: 0.99 loss_energy: mae loss_force: mse # all below is for the scheduler @@ -46,69 +46,102 @@ default: min_lr: 0.000001 verbose: true model: - hidden_channels: 256 + hidden_channels: 128 num_filters: 128 - num_gaussians: 100 + num_gaussians: 20 num_interactions: 6 cutoff: 5.0 - regress_forces: "from_energy" + regress_forces: from_energy runs: - - {} - - optim: - lr_initial: 0.001 - batch_size: 1024 - - optim: - batch_size: 128 - - model: - hidden_channels: 512 - - model: - num_interactions: 6 - num_gaussians: 50 - hidden_channels: 512 + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True - - config: fanet-qm7x-all + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True optim: - initial_lr: 0.0005 - energy_coefficient: 0.001 - energy_grad_coefficient: 0.01 - force_coefficient: 0.989 - model: - graph_norm: true - force_decoder_type: mlp - edge_embed_type: all_rij - regress_forces: direct_with_gradient_target - - config: fanet-qm7x-all + lr_initial: 0.001 + batch_size: 100 + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True optim: - initial_lr: 0.0001 - energy_coefficient: 0.001 - energy_grad_coefficient: 0.01 - force_coefficient: 0.989 - model: - graph_norm: true - force_decoder_type: mlp - edge_embed_type: all_rij - regress_forces: direct_with_gradient_target - - config: fanet-qm7x-all + lr_initial: 0.001 + batch_size: 256 + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: False + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: False optim: - initial_lr: 0.0001 - energy_coefficient: 0.001 - energy_grad_coefficient: 0.01 - force_coefficient: 0.989 - model: - graph_norm: false - force_decoder_type: mlp - edge_embed_type: all_rij - regress_forces: direct_with_gradient_target - - config: fanet-qm7x-all + lr_initial: 0.001 + batch_size: 100 + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: False optim: - initial_lr: 0.0001 - energy_coefficient: 0.001 - energy_grad_coefficient: 0.01 - force_coefficient: 0.989 - model: - graph_norm: true - force_decoder_type: mlp - mp_type: updownscale_base - edge_embed_type: all_rij - regress_forces: direct_with_gradient_target + lr_initial: 0.001 + batch_size: 256 + + # - config: fanet-qm7x-all + # model: + # graph_norm: true + # edge_embed_type: all_rij + # mp_type: updownscale_base + + # - config: fanet-qm7x-all + # optim: + # energy_coefficient: 0.01 + # energy_grad_coefficient: 0.1 + # force_coefficient: 0.89 + # lr_initial: 0.001 + # batch_size: 100 + # model: + # graph_norm: true + # edge_embed_type: all_rij + # mp_type: updownscale_base + # force_decoder_type: mlp + # regress_forces: direct_with_gradient_target + + # - config: fanet-qm7x-all + # optim: + # energy_coefficient: 0.01 + # energy_grad_coefficient: 0.1 + # force_coefficient: 0.89 + # lr_initial: 0.001 + # batch_size: 100 + # model: + # graph_norm: false + # force_decoder_type: mlp + # edge_embed_type: all_rij + # regress_forces: direct_with_gradient_target + # num_interactions: 4 
+ + # - config: fanet-qm7x-all + # optim: + # energy_coefficient: 0.01 + # energy_grad_coefficient: 0.1 + # force_coefficient: 0.89 + # lr_initial: 0.001 + # batch_size: 100 + # model: + # graph_norm: true + # force_decoder_type: mlp + # mp_type: updownscale_base + # edge_embed_type: all_rij + # regress_forces: direct_with_gradient_target + # num_interactions: 3 + # num_filters: 256 + # hidden_channels: 256 diff --git a/main.py b/main.py index 9a756f817a..2f1d88f939 100644 --- a/main.py +++ b/main.py @@ -112,10 +112,6 @@ def run(self, orion_exp=None): self.hparams, orion_race_condition = dist_utils.broadcast_from_master( self.hparams, orion_race_condition ) - if orion_race_condition: - if dist_utils.is_master(): - shutil.rmtree(self.trainer_config["run_dir"]) - return if self.hparams: print("\n💎 Received hyper-parameters from Orion:") print(dump(self.hparams), end="\n") diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index f59a540919..a89574096f 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -8,8 +8,13 @@ import sys import time from datetime import datetime +import yaml +from tqdm import tqdm -rundir = Path(os.environ["SCRATCH"]) / "ocp" / "runs" +RUN_DIR = Path(os.environ["SCRATCH"]) / "ocp" / "runs" +ROOT = Path(__file__).resolve().parent.parent.parent +EXP_OUT_DIR = ROOT / "data" / "exp_outputs" +MANAGER_CACHE = ROOT / "data" / "exp_manager_cache" class Manager: @@ -18,9 +23,13 @@ def __init__( orion_db_path="", name="", wandb_path="mila-ocp/ocp-qm", + rebuild_cache=False, + print_tracebacks=True, ): self.api = wandb.Api() self.wandb_path = wandb_path + self.rebuild_cache = rebuild_cache + self.print_tracebacks = print_tracebacks self.wandb_runs = [ r for r in self.api.runs(wandb_path) @@ -28,6 +37,12 @@ def __init__( and name in r.config.get("orion_exp_config_path", "") ] self.name = name + self.cache_path = MANAGER_CACHE / f"{self.name}.yaml" + self.cache = ( + yaml.safe_load(self.cache_path.read_text()) + if self.cache_path.exists() + else {} + ) self.trial_hparams_to_rundirs = defaultdict(list) self.exp = get_experiment( name=name, @@ -63,7 +78,12 @@ def __init__( [p.name for runs in self.trial_hparams_to_rundirs.values() for p in runs] ) print("\n") + self.discover_yamls() + self.discover_job_ids_from_yaml() + self.parse_output_files() self.print_status() + print("\n") + self.print_output_files_stats() def print_status(self): print("{:32} : {:4} ".format("Trials in experiment", len(self.trials))) @@ -120,7 +140,7 @@ def print_status(self): ) running = set(self.job_ids) & sq waiting = ( - set([j.parent.name for j in rundir.glob(f"*/{self.name}.exp")]) & sq + set([j.parent.name for j in RUN_DIR.glob(f"*/{self.name}.exp")]) & sq ) - running print( "{:32} : {}".format( @@ -136,7 +156,7 @@ def print_status(self): ) def discover_run_dirs(self): - for unique in rundir.glob(f"*/{self.name}--*.unique"): + for unique in RUN_DIR.glob(f"*/{self.name}--*.unique"): self.trial_hparams_to_rundirs[unique.stem.split("--")[-1]].append( unique.parent ) @@ -167,6 +187,100 @@ def get_reserved_wandb_runs(self): def print_wandb_query(self): print(f"{'WandB runs query:':32}\n" + "(" + "|".join(self.job_ids) + ")") + def parse_output_files(self): + if "job_state" not in self.cache: + self.cache["job_state"] = {} + for j in tqdm(self.cache["all_job_ids"], desc="Parsing output files"): + if j in self.cache["job_state"] and not self.rebuild_cache: + continue + out_file = RUN_DIR / j / "output-0.txt" + + if not out_file.exists(): + 
self.cache["job_state"][j] = "No output file (RaceCondition)" + continue + + out_txt = out_file.read_text() + if "RaceCondition" in out_txt: + self.cache["job_state"][j] = "RaceCondition" + elif "Traceback" in out_txt: + self.cache["job_state"][j] = ( + "Traceback: " + out_txt.split("Traceback")[1] + ) + elif "srun: Job step aborted" in out_txt: + if "slurmstepd" in out_txt and " CANCELLED AT " in out_txt: + self.cache["job_state"][j] = "Cancelled" + elif "eval_all_splits" in out_txt and "Final results" in out_txt: + self.cache["job_state"][j] = "Finished" + elif "nan_loss" in out_txt: + self.cache["job_state"][j] = "NaN loss" + else: + self.cache["job_state"][j] = "Unknown" + self.commit_cache() + + def print_output_files_stats(self): + print("Job status from output files:\n" + "-" * 29 + "\n") + stats = {} + for j, o in self.cache["job_state"].items(): + if "Traceback" in o: + if "Traceback" not in stats: + stats["Traceback"] = {"n": 0, "ids": [], "contents": []} + stats["Traceback"]["n"] += 1 + stats["Traceback"]["ids"].append(j) + stats["Traceback"]["contents"].append(o) + else: + if o not in stats: + stats[o] = {"n": 0, "ids": []} + stats[o]["n"] += 1 + stats[o]["ids"].append(j) + for s, v in stats.items(): + print(f"• {s:31}" + f": {v['n']} (" + " ".join(v["ids"]) + ")") + if stats["Traceback"]["n"] > 0 and self.print_tracebacks: + print("\nTraceback contents:\n" + "-" * 19 + "\n") + print( + f"\n\n{'|' * 50}\n{'|' * 50}\n{'|' * 50}\n".join( + f"{j}:\n{o}" + for j, o in zip( + stats["Traceback"]["ids"], stats["Traceback"]["contents"] + ) + ) + ) + + def discover_job_ids_from_yaml(self): + all_jobs = ( + set(self.cache.get("all_job_ids", [])) if not self.rebuild_cache else set() + ) + for yaml_path in self.cache["exp_yamls"]: + lines = Path(yaml_path).read_text().splitlines() + jobs_line = [line for line in lines if "All jobs launched" in line][0] + jobs = [ + j.strip() + for j in jobs_line.split("All jobs launched: ")[-1].strip().split(", ") + ] + all_jobs |= set(jobs) + self.cache["all_job_ids"] = sorted(all_jobs) + self.commit_cache() + + def discover_yamls(self): + yamls = set() + if self.cache and not self.rebuild_cache: + cache_yamls = self.cache.get("exp_yamls") or [] + yamls |= set(cache_yamls) + for yaml_conf in EXP_OUT_DIR.glob("**/*.yaml"): + if str(yaml_conf) not in yamls: + yaml_txt = yaml_conf.read_text() + if self.name in yaml_txt: + y = yaml.safe_load(yaml_txt) + if y.get("orion", {}).get("unique_exp_name") == self.name: + yamls.add(str(yaml_conf)) + yamls = sorted(yamls) + self.cache["exp_yamls"] = yamls + self.commit_cache() + + def commit_cache(self): + if not self.cache_path.parent.exists(): + self.cache_path.parent.mkdir(parents=True) + self.cache_path.write_text(yaml.safe_dump(self.cache)) + @classmethod def help(self): return dedent( @@ -175,9 +289,11 @@ def help(self): Manager init() -------------- - orion_db_path -> (str or pathlib.Path) pointing to the orion db pickle file - name -> (str) unique orion experiment name in the db - wandb_path -> (str) path to the wandb project like "{entity}/{project}" + orion_db_path -> (str or pathlib.Path) pointing to the orion db pickle file + name -> (str) unique orion experiment name in the db + wandb_path -> (str) path to the wandb project like "{entity}/{project}" + rebuild_cache -> (bool, default: False) if True, will rebuild the output file cache from scratch + print_tracebacks -> (bool, default: False) if True, will print the Traceback contents in the output files ---------- Attributes @@ -220,6 +336,8 @@ def 
help(self): "name": None, "wandb_path": None, "watch": -1, + "rebuild_cache": False, + "print_tracebacks": False, } args = resolved_args(defaults=defaults) if args.help: @@ -230,6 +348,11 @@ def help(self): "In [1]: run ocpmodels/common/exp_manager.py", "name='ocp-qm9-orion-debug-v1.0.0' wandb_path='mila-ocp/ocp-3'", ) + print( + "In [1]: run ocpmodels/common/exp_manager.py", + "name='ocp-qm9-orion-debug-v1.0.0' wandb_path='mila-ocp/ocp-3'", + "print_tracebacks", + ) print("\n\n🧞 Manager help:") print(Manager.help()) sys.exit(0) @@ -257,6 +380,8 @@ def help(self): name=args.name, wandb_path=args.wandb_path, orion_db_path=orion_db_path, + rebuild_cache=args.rebuild_cache, + print_tracebacks=args.print_tracebacks, ) # m.print_wandb_query() From a7a0dee8514854263d77c5c05119c91f71643ce7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 17:24:09 -0500 Subject: [PATCH 167/273] use `get_and_move_orion_db_path` --- ocpmodels/common/exp_manager.py | 8 ++------ ocpmodels/common/utils.py | 32 +++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index a89574096f..b90e473742 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -10,9 +10,8 @@ from datetime import datetime import yaml from tqdm import tqdm +from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path -RUN_DIR = Path(os.environ["SCRATCH"]) / "ocp" / "runs" -ROOT = Path(__file__).resolve().parent.parent.parent EXP_OUT_DIR = ROOT / "data" / "exp_outputs" MANAGER_CACHE = ROOT / "data" / "exp_manager_cache" @@ -372,10 +371,7 @@ def help(self): "💃 Status of experiment", f"'{args.name}' and wandb entity/project '{args.wandb_path}':", ) - orion_db_path = str( - Path(__file__).resolve().parent.parent.parent - / f"data/orion/storage/{args.name}_db.pkl" - ) + orion_db_path = get_and_move_orion_db_path(args.name) m = Manager( name=args.name, wandb_path=args.wandb_path, diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 8aafd6c51e..8c89c73a22 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -5,8 +5,8 @@ LICENSE file in the root directory of this source tree. 
""" -import ast import argparse +import ast import collections import copy import glob @@ -24,6 +24,7 @@ from functools import wraps from itertools import product from pathlib import Path +from shutil import copyfile import numpy as np import torch @@ -66,6 +67,7 @@ def __getattr__(self, k: str): OCP_TASKS = {"s2ef", "is2re", "is2es"} ROOT = Path(__file__).resolve().parent.parent.parent JOB_ID = os.environ.get("SLURM_JOB_ID") +RUN_DIR = Path(os.environ["SCRATCH"]) / "ocp" / "runs" def set_max_fidelity(hparams, orion_exp): @@ -137,6 +139,31 @@ def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): return updated_hparams +def get_and_move_orion_db_path(exp_name): + db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) + db_file = f"{db_id}_db.pkl" if not db_id.endswith("_db.pkl") else db_id + scratch_db = RUN_DIR.parent / "orion" / "storage" / db_file + scratch_db.parent.mkdir(parents=True, exist_ok=True) + if not scratch_db.exists(): + home_db = ROOT / f"data/orion/storage/{db_file}" + + if not home_db.exists(): + return scratch_db + + lock_file = home_db.parent / f"{db_file}.lock" + if not lock_file.exists(): + lock_file.touch() + copyfile(home_db, scratch_db) + print("Copied db from home to scratch.") + lock_file.unlink() + + while lock_file.exists(): + print("Waiting for lock to be released...") + time.sleep(1) + + return scratch_db + + def load_orion_exp(args): exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) @@ -147,8 +174,7 @@ def load_orion_exp(args): print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") exp_name = args.orion_unique_exp_name or exp_config["unique_exp_name"] db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) - db_path = ROOT / "data" / "orion" / "storage" / f"{db_id}_db.pkl" - db_path.parent.mkdir(parents=True, exist_ok=True) + db_path = get_and_move_orion_db_path(db_id) experiment = build_experiment( storage={ "database": { From bf09c2c223fe832b8e01006e392921485049090d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 17:40:49 -0500 Subject: [PATCH 168/273] cp lock --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 8c89c73a22..a62fa3f903 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -150,7 +150,7 @@ def get_and_move_orion_db_path(exp_name): if not home_db.exists(): return scratch_db - lock_file = home_db.parent / f"{db_file}.lock" + lock_file = home_db.parent / f"{db_file}.cp_lock" if not lock_file.exists(): lock_file.touch() copyfile(home_db, scratch_db) From f687fea87311430fd1322650fc221c6884802ee1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 17:55:44 -0500 Subject: [PATCH 169/273] handle symlink --- ocpmodels/common/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index a62fa3f903..4c61943cbb 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -24,7 +24,7 @@ from functools import wraps from itertools import product from pathlib import Path -from shutil import copyfile +from shutil import copyfile, move import numpy as np import torch @@ -154,7 +154,9 @@ def get_and_move_orion_db_path(exp_name): if not lock_file.exists(): lock_file.touch() copyfile(home_db, scratch_db) - print("Copied db from home to scratch.") + move(home_db, home_db.parent / f"{db_file}.bak") + os.symlink(str(scratch_db), str(home_db)) + 
print("Copied and symlinked db from home to scratch.") lock_file.unlink() while lock_file.exists(): From c24b7ee0eaed82d7fcddeb008f056a9acfde8db7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:06:50 -0500 Subject: [PATCH 170/273] add `rescale_with_hof` option --- .../models/qm7x-metadata/hof_rescales.json | 1 + configs/models/tasks/qm7x.yaml | 1 + main.py | 10 ---- ocpmodels/common/utils.py | 44 +++++++++++----- ocpmodels/datasets/qm7x.py | 29 ++++++++--- ocpmodels/modules/normalizer.py | 32 ++++++++++-- ocpmodels/trainers/base_trainer.py | 4 ++ ocpmodels/trainers/single_trainer.py | 29 +++++++++-- scripts/compute_qm7x_rescales.py | 51 +++++++++++++++++++ 9 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 configs/models/qm7x-metadata/hof_rescales.json create mode 100644 scripts/compute_qm7x_rescales.py diff --git a/configs/models/qm7x-metadata/hof_rescales.json b/configs/models/qm7x-metadata/hof_rescales.json new file mode 100644 index 0000000000..2a0d05d0ee --- /dev/null +++ b/configs/models/qm7x-metadata/hof_rescales.json @@ -0,0 +1 @@ +{"mean": -1.373329520225525, "std": 0.3661123216152191, "about": "Statistics for y(=ePBE0+MBD) / sum(HOF) where HOF is the heat of formation of each element in the graph. This is computed over the train set only."} \ No newline at end of file diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 81d9ace719..c832e2c82f 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -36,6 +36,7 @@ default: target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 + rescale_with_hof: True val_id: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by utils.py if this is True diff --git a/main.py b/main.py index 2f1d88f939..dc763ce3de 100644 --- a/main.py +++ b/main.py @@ -43,16 +43,6 @@ # os.environ["CUDA_LAUNCH_BLOCKING"] = "1" torch.multiprocessing.set_sharing_strategy("file_system") -try: - import ipdb # noqa: F401 - - os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" -except: # noqa: E722 - print( - "`ipdb` is not installed. ", - "Consider `pip install ipdb` to improve your debugging experience.", - ) - def print_warnings(): warnings = [ diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 4c61943cbb..b7e337843f 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -515,6 +515,13 @@ def set_qm7x_target_stats(trainer_config): (ROOT / "configs" / "models" / "qm7x-metadata" / "stats.json").read_text() ) + hof_stats = json.loads( + ( + ROOT / "configs" / "models" / "qm7x-metadata" / "hof_rescales.json" + ).read_text() + ) + hof_stats.pop("about", None) + for d, dataset in deepcopy(trainer_config["dataset"]).items(): if d == "default_val": continue @@ -534,6 +541,11 @@ def set_qm7x_target_stats(trainer_config): trainer_config["dataset"][d]["grad_target_mean"] = mean trainer_config["dataset"][d]["grad_target_std"] = std / std_divider + if "train" in trainer_config["dataset"] and trainer_config["dataset"]["train"].get( + "rescale_with_hof" + ): + trainer_config["dataset"]["train"]["hof_rescales"] = hof_stats + return trainer_config @@ -845,6 +857,16 @@ def add_edge_distance_to_graph( def setup_imports(): from ocpmodels.common.registry import registry + try: + import ipdb # noqa: F401 + + os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" + except: # noqa: E722 + print( + "`ipdb` is not installed. 
", + "Consider `pip install ipdb` to improve your debugging experience.", + ) + # First, check if imports are already setup has_already_setup = registry.get("imports_setup", no_warning=True) if has_already_setup: @@ -1651,22 +1673,18 @@ def get_commit_hash(): def base_config(config, overrides={}): - from argparse import Namespace + from ocpmodels.common.flags import flags - n = Namespace() - n.num_gpus = 1 - n.num_nodes = 1 - n.config_yml = None - n.config = config + setup_imports() conf = build_config( - n, - [ - "run_dir=.", - "no_qm7x_cp=true", - "no_cpus_to_workers=true", - "silent=", - ], + *flags.get_parser().parse_known_args( + [ + f"--config={config}", + "--logger=dummy", + ] + ) ) + conf["cpu"] = not torch.cuda.is_available() return merge_dicts(conf, overrides) diff --git a/ocpmodels/datasets/qm7x.py b/ocpmodels/datasets/qm7x.py index 6ecf76ded9..f98774a60c 100644 --- a/ocpmodels/datasets/qm7x.py +++ b/ocpmodels/datasets/qm7x.py @@ -1,24 +1,28 @@ -import time -from torch.utils.data import Dataset +import pickle import random import re +import time from abc import abstractmethod from collections import defaultdict from collections.abc import Iterable from pathlib import Path -import pickle + import h5py +import lmdb import numpy as np import torch +from mendeleev.fetch import fetch_table from rdkit import Chem from rdkit.Chem import AllChem from scipy import spatial as sp from torch import as_tensor +from torch.utils.data import Dataset from torch_geometric.data import Data -from cosmosis.dataset import CDataset from tqdm import tqdm -import lmdb + +from cosmosis.dataset import CDataset from ocpmodels.common.registry import registry +from ocpmodels.common.utils import ROOT try: import orjson as json # noqa: F401 @@ -754,6 +758,10 @@ def __init__( for i in all_samples["splits"][split] ] + self.hofs = fetch_table("elements")["heat_of_formation"].values + self.hofs[np.isnan(self.hofs)] = self.hofs[~np.isnan(self.hofs)].mean() + self.hofs = torch.from_numpy(self.hofs).float() + self.transform = transform def __len__(self): @@ -785,6 +793,9 @@ def __getitem__(self, i): data.natoms = len(data.pos) data.tags = torch.full((data.natoms,), -1, dtype=torch.long) data.atomic_numbers = torch.tensor(data.atNUM, dtype=torch.long) + data.hofs = self.hofs[ + data.atomic_numbers.numpy().astype(int) - 1 # element 1 is at row 0 + ].sum() t1 = time.time_ns() if self.transform is not None: @@ -809,12 +820,14 @@ def close_db(self): if __name__ == "__main__": - from ocpmodels.datasets.qm7x import QM7XFromLMDB as QM7X + import json from pathlib import Path - from tqdm import tqdm + import numpy as np - import json + from tqdm import tqdm + from ocpmodels.common.data_parallel import ParallelCollater + from ocpmodels.datasets.qm7x import QM7XFromLMDB as QM7X src = Path("/network/projects/ocp/qm7x/processed") smp = Path("configs/models/qm7x-metadata/samples.json") diff --git a/ocpmodels/modules/normalizer.py b/ocpmodels/modules/normalizer.py index 302f0d6cef..bbe169eee9 100644 --- a/ocpmodels/modules/normalizer.py +++ b/ocpmodels/modules/normalizer.py @@ -19,6 +19,8 @@ def __init__(self, tensor=None, mean=None, std=None, device=None): if device is None: device = "cpu" + self.device = device + if tensor is not None: self.mean = torch.mean(tensor, dim=0).to(device) self.std = torch.std(tensor, dim=0).to(device) @@ -28,19 +30,43 @@ def __init__(self, tensor=None, mean=None, std=None, device=None): self.mean = torch.tensor(mean).to(device) self.std = torch.tensor(std).to(device) + self.hof_mean = None + 
self.hof_std = None + def to(self, device): self.mean = self.mean.to(device) self.std = self.std.to(device) + if self.hof_mean: + self.hof_mean = self.hof_mean.to(device) + if self.hof_std: + self.hof_std = self.hof_std.to(device) + self.device = device - def norm(self, tensor): + def norm(self, tensor, hofs=None): + if hofs is not None: + return tensor / hofs - self.hof_mean return (tensor - self.mean) / self.std - def denorm(self, normed_tensor): + def denorm(self, normed_tensor, hofs=None): + if hofs is not None: + return (normed_tensor + self.hof_mean) * hofs return normed_tensor * self.std + self.mean def state_dict(self): - return {"mean": self.mean, "std": self.std} + sd = {"mean": self.mean, "std": self.std} + if self.hof_rescales: + sd["hof_rescales"] = { + "mean": self.hof_mean, + "std": self.hof_std, + } + return sd def load_state_dict(self, state_dict): self.mean = state_dict["mean"].to(self.mean.device) self.std = state_dict["std"].to(self.mean.device) + if "hof_rescales" in state_dict: + self.set_hof_rescales(state_dict["hof_rescales"]) + + def set_hof_rescales(self, hof_rescales): + self.hof_mean = torch.tensor(hof_rescales["mean"], device=self.device) + self.hof_std = torch.tensor(hof_rescales["std"], device=self.device) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 71ebca2585..507d8eefc9 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -320,6 +320,10 @@ def load_datasets(self): std=self.normalizer["target_std"], device=self.device, ) + if "hof_rescales" in self.normalizer: + self.normalizers["target"].set_hof_rescales( + self.normalizer["hof_rescales"] + ) else: self.normalizers["target"] = Normalizer( tensor=self.datasets["train"].data.y[ diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index ef7fc5fff6..74c8217681 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -134,7 +134,14 @@ def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False) preds = self.model_forward(batch_list) if self.normalizers is not None and "target" in self.normalizers: - preds["energy"] = self.normalizers["target"].denorm(preds["energy"]) + hofs = None + if self.task_name == "qm7x": + hofs = torch.cat( + [batch.hofs.to(self.device) for batch in batch_list], dim=0 + ) + preds["energy"] = self.normalizers["target"].denorm( + preds["energy"], hofs=hofs + ) if self.normalizers is not None and "grad_target" in self.normalizers: self.normalizers["grad_target"].to(self.device) @@ -467,7 +474,12 @@ def compute_loss(self, preds, batch_list): ) if self.normalizer.get("normalize_labels", False): - target_normed = self.normalizers["target"].norm(energy_target) + hofs = None + if self.task_name == "qm7x": + hofs = torch.cat( + [batch.hofs.to(self.device) for batch in batch_list], dim=0 + ) + target_normed = self.normalizers["target"].norm(energy_target, hofs=hofs) else: target_normed = energy_target energy_mult = self.config["optim"].get("energy_coefficient", 1) @@ -609,10 +621,19 @@ def compute_metrics( ) if self.normalizer.get("normalize_labels") and "target" in self.normalizers: + hofs = None + if self.task_name == "qm7x": + hofs = torch.cat( + [batch.hofs.to(self.device) for batch in batch_list], dim=0 + ) if not self.config.get("no_metrics_denorm"): - preds["energy"] = self.normalizers["target"].denorm(preds["energy"]) + preds["energy"] = self.normalizers["target"].denorm( + preds["energy"], hofs=hofs + ) else: - 
target["energy"] = self.normalizers["target"].norm(target["energy"]) + target["energy"] = self.normalizers["target"].norm( + target["energy"], hofs=hofs + ) metrics = evaluator.eval(preds, target, prev_metrics=metrics) diff --git a/scripts/compute_qm7x_rescales.py b/scripts/compute_qm7x_rescales.py new file mode 100644 index 0000000000..b28c7a4c15 --- /dev/null +++ b/scripts/compute_qm7x_rescales.py @@ -0,0 +1,51 @@ +import json +import os +from pathlib import Path + +import numpy as np +from mendeleev.fetch import fetch_table +from tqdm import tqdm + +os.path.append(Path(__file__).resolve().parent.parent) + +from ocpmodels.common.utils import ( + ROOT, + base_config, + move_lmdb_data_to_slurm_tmpdir, +) +from ocpmodels.trainers.single_trainer import SingleTrainer + +if __name__ == "__main__": + config = base_config("schnet-qm7x-all") + config["cp_data_to_tmpdir"] = True + config = move_lmdb_data_to_slurm_tmpdir(config) + trainer = SingleTrainer(**config) + + df = fetch_table("elements") + HOF = df.set_index("atomic_number")["heat_of_formation"].values + non_nan_hof_mean = HOF[~np.isnan(HOF)].mean() + print("non_nan_hof_mean: ", non_nan_hof_mean) # 353.3106853932584 + HOF[np.isnan(HOF)] = non_nan_hof_mean + + hofs = [] + + for batch_list in tqdm(trainer.loaders["train"]): + hofs += [ + y / HOF[z.astype(int) - 1].sum() + for y, z in zip(batch_list[0].y, batch_list[0].atNUM) + ] + + mean = np.mean(hofs) + std = np.std(hofs) + + (ROOT / "configs" / "models" / "qm7x-metadata" / "hof_rescales.json").write_text( + json.dumps( + { + "mean": float(mean), + "std": float(std), + "about": "Statistics for y(=ePBE0+MBD) / sum(HOF) " + + "where HOF is the heat of formation of each element in the graph." + + " This is computed over the train set only.", + } + ) + ) From b0deb50792570807a01a1f742901dbfd76e34d30 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:17:44 -0500 Subject: [PATCH 171/273] fix path --- configs/exps/qm7x/schnet-fanet.yaml | 97 +++++++++++++++-------------- ocpmodels/common/exp_manager.py | 3 + 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index ece614e5f3..10c22056b1 100644 --- a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -52,6 +52,9 @@ default: num_interactions: 6 cutoff: 5.0 regress_forces: from_energy + dataset: + train: + rescale_with_hof: True runs: - config: schnet-qm7x-all @@ -95,53 +98,53 @@ runs: lr_initial: 0.001 batch_size: 256 - # - config: fanet-qm7x-all - # model: - # graph_norm: true - # edge_embed_type: all_rij - # mp_type: updownscale_base + - config: fanet-qm7x-all + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base - # - config: fanet-qm7x-all - # optim: - # energy_coefficient: 0.01 - # energy_grad_coefficient: 0.1 - # force_coefficient: 0.89 - # lr_initial: 0.001 - # batch_size: 100 - # model: - # graph_norm: true - # edge_embed_type: all_rij - # mp_type: updownscale_base - # force_decoder_type: mlp - # regress_forces: direct_with_gradient_target + - config: fanet-qm7x-all + optim: + energy_coefficient: 0.01 + energy_grad_coefficient: 0.1 + force_coefficient: 0.89 + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct_with_gradient_target - # - config: fanet-qm7x-all - # optim: - # energy_coefficient: 0.01 - # energy_grad_coefficient: 0.1 - # force_coefficient: 0.89 - 
# lr_initial: 0.001 - # batch_size: 100 - # model: - # graph_norm: false - # force_decoder_type: mlp - # edge_embed_type: all_rij - # regress_forces: direct_with_gradient_target - # num_interactions: 4 + - config: fanet-qm7x-all + optim: + energy_coefficient: 0.01 + energy_grad_coefficient: 0.1 + force_coefficient: 0.89 + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + num_interactions: 4 - # - config: fanet-qm7x-all - # optim: - # energy_coefficient: 0.01 - # energy_grad_coefficient: 0.1 - # force_coefficient: 0.89 - # lr_initial: 0.001 - # batch_size: 100 - # model: - # graph_norm: true - # force_decoder_type: mlp - # mp_type: updownscale_base - # edge_embed_type: all_rij - # regress_forces: direct_with_gradient_target - # num_interactions: 3 - # num_filters: 256 - # hidden_channels: 256 + - config: fanet-qm7x-all + optim: + energy_coefficient: 0.01 + energy_grad_coefficient: 0.1 + force_coefficient: 0.89 + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: true + force_decoder_type: mlp + mp_type: updownscale_base + edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + num_interactions: 3 + num_filters: 256 + hidden_channels: 256 diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index b90e473742..37370c8c69 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -10,6 +10,9 @@ from datetime import datetime import yaml from tqdm import tqdm + +os.path.append(Path(__file__).resolve().parent.parent.parent) + from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path EXP_OUT_DIR = ROOT / "data" / "exp_outputs" From 3ce82f4767fc1ca5f830bc137215e5a9e8f5f295 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:18:47 -0500 Subject: [PATCH 172/273] sys not os --- ocpmodels/common/exp_manager.py | 2 +- scripts/compute_qm7x_rescales.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 37370c8c69..4ce5f560bc 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -11,7 +11,7 @@ import yaml from tqdm import tqdm -os.path.append(Path(__file__).resolve().parent.parent.parent) +sys.path.append(Path(__file__).resolve().parent.parent.parent) from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path diff --git a/scripts/compute_qm7x_rescales.py b/scripts/compute_qm7x_rescales.py index b28c7a4c15..7713d360f9 100644 --- a/scripts/compute_qm7x_rescales.py +++ b/scripts/compute_qm7x_rescales.py @@ -1,12 +1,12 @@ import json -import os +import sys from pathlib import Path import numpy as np from mendeleev.fetch import fetch_table from tqdm import tqdm -os.path.append(Path(__file__).resolve().parent.parent) +sys.path.append(Path(__file__).resolve().parent.parent) from ocpmodels.common.utils import ( ROOT, From f266f8b825b20b7ce7d57b307534f232da01fa6e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:20:53 -0500 Subject: [PATCH 173/273] str paths --- ocpmodels/common/exp_manager.py | 2 +- scripts/compute_qm7x_rescales.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 4ce5f560bc..a67a48f24f 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -11,7 +11,7 @@ import yaml from tqdm import tqdm 
-sys.path.append(Path(__file__).resolve().parent.parent.parent) +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path diff --git a/scripts/compute_qm7x_rescales.py b/scripts/compute_qm7x_rescales.py index 7713d360f9..7e91bd53ae 100644 --- a/scripts/compute_qm7x_rescales.py +++ b/scripts/compute_qm7x_rescales.py @@ -6,7 +6,7 @@ from mendeleev.fetch import fetch_table from tqdm import tqdm -sys.path.append(Path(__file__).resolve().parent.parent) +sys.path.append(str(Path(__file__).resolve().parent.parent)) from ocpmodels.common.utils import ( ROOT, From bcc2e0eddeaaa32a2f24c3d47041f91bc706fd3c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:23:11 -0500 Subject: [PATCH 174/273] finished first in exp manager --- ocpmodels/common/exp_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index a67a48f24f..16e19167bd 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -202,7 +202,9 @@ def parse_output_files(self): continue out_txt = out_file.read_text() - if "RaceCondition" in out_txt: + if "eval_all_splits" in out_txt and "Final results" in out_txt: + self.cache["job_state"][j] = "Finished" + elif "RaceCondition" in out_txt: self.cache["job_state"][j] = "RaceCondition" elif "Traceback" in out_txt: self.cache["job_state"][j] = ( @@ -211,8 +213,6 @@ def parse_output_files(self): elif "srun: Job step aborted" in out_txt: if "slurmstepd" in out_txt and " CANCELLED AT " in out_txt: self.cache["job_state"][j] = "Cancelled" - elif "eval_all_splits" in out_txt and "Final results" in out_txt: - self.cache["job_state"][j] = "Finished" elif "nan_loss" in out_txt: self.cache["job_state"][j] = "NaN loss" else: From 137815426d6c68346907c3d8b673b2ae007a08fc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:25:03 -0500 Subject: [PATCH 175/273] 800 warmup epochs --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 88e871f0a4..ed94f4fddc 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -21,7 +21,7 @@ default: # early stopping es_patience: 20 es_min_abs_change: 0.000001 - es_warmup_epochs: 500 + es_warmup_epochs: 800 # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min From 37ae180f4611203b368797b7af1df58f9250ff43 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:28:34 -0500 Subject: [PATCH 176/273] parse running/waiting jobs --- ocpmodels/common/exp_manager.py | 46 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 16e19167bd..e760c090b1 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -79,6 +79,23 @@ def __init__( self.job_ids = sorted( [p.name for runs in self.trial_hparams_to_rundirs.values() for p in runs] ) + sq_cmd = ( + "/opt/slurm/bin/squeue" + if "CC_CLUSTER" not in os.environ + else "/opt/software/slurm/bin/squeue" + ) + sq = set( + [ + j.strip() + for j in os.popen(f"{sq_cmd} -u $USER -o '%12i'") + .read() + .splitlines()[1:] + ] + ) + self.running_jobs = set(self.job_ids) & sq + self.waiting_jobs = ( + set([j.parent.name for j in 
RUN_DIR.glob(f"*/{self.name}.exp")]) & sq + ) - self.running_jobs print("\n") self.discover_yamls() self.discover_job_ids_from_yaml() @@ -127,33 +144,17 @@ def print_status(self): ) print("{:32} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) print("{:32} : {}".format("Algorithm's budgets", str(self.budgets))) - sq_cmd = ( - "/opt/slurm/bin/squeue" - if "CC_CLUSTER" not in os.environ - else "/opt/software/slurm/bin/squeue" - ) - sq = set( - [ - j.strip() - for j in os.popen(f"{sq_cmd} -u $USER -o '%12i'") - .read() - .splitlines()[1:] - ] - ) - running = set(self.job_ids) & sq - waiting = ( - set([j.parent.name for j in RUN_DIR.glob(f"*/{self.name}.exp")]) & sq - ) - running + print( "{:32} : {}".format( "Jobs currently running:", - f"{len(running)} " + " ".join(sorted(running)), + f"{len(self.running_jobs)} " + " ".join(sorted(self.running_jobs)), ) ) print( "{:32} : {}".format( "Jobs currently waiting:", - f"{len(waiting)} " + " ".join(sorted(waiting)), + f"{len(self.waiting_jobs)} " + " ".join(sorted(self.waiting_jobs)), ) ) @@ -216,7 +217,12 @@ def parse_output_files(self): elif "nan_loss" in out_txt: self.cache["job_state"][j] = "NaN loss" else: - self.cache["job_state"][j] = "Unknown" + if j in self.waiting_jobs: + self.cache["job_state"][j] = "Waiting" + if j in self.running_jobs: + self.cache["job_state"][j] = "Running" + else: + self.cache["job_state"][j] = "Unknown" self.commit_cache() def print_output_files_stats(self): From 92dfa9a7b545c1802caf5558ada1cf4d1699d46f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:29:21 -0500 Subject: [PATCH 177/273] parse running/waiting jobs --- ocpmodels/common/exp_manager.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index e760c090b1..fae22c99c3 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -196,6 +196,12 @@ def parse_output_files(self): for j in tqdm(self.cache["all_job_ids"], desc="Parsing output files"): if j in self.cache["job_state"] and not self.rebuild_cache: continue + if j in self.waiting_jobs: + self.cache["job_state"][j] = "Waiting" + continue + if j in self.running_jobs: + self.cache["job_state"][j] = "Running" + continue out_file = RUN_DIR / j / "output-0.txt" if not out_file.exists(): @@ -217,12 +223,7 @@ def parse_output_files(self): elif "nan_loss" in out_txt: self.cache["job_state"][j] = "NaN loss" else: - if j in self.waiting_jobs: - self.cache["job_state"][j] = "Waiting" - if j in self.running_jobs: - self.cache["job_state"][j] = "Running" - else: - self.cache["job_state"][j] = "Unknown" + self.cache["job_state"][j] = "Unknown" self.commit_cache() def print_output_files_stats(self): From 7a496ea0e8a2689fe1003207caad1728d476b8cb Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:31:50 -0500 Subject: [PATCH 178/273] parse time limit --- ocpmodels/common/exp_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index fae22c99c3..7a46e9166d 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -211,6 +211,8 @@ def parse_output_files(self): out_txt = out_file.read_text() if "eval_all_splits" in out_txt and "Final results" in out_txt: self.cache["job_state"][j] = "Finished" + elif "DUE TO TIME LIMIT" in out_txt: + self.cache["job_state"][j] = "TimeLimit" elif "RaceCondition" in out_txt: self.cache["job_state"][j] = 
"RaceCondition" elif "Traceback" in out_txt: From 61114adcf236534287a5fec047b2ed78105891a8 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:34:06 -0500 Subject: [PATCH 179/273] improve ES --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 4 ++-- ocpmodels/modules/scheduler.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index ed94f4fddc..bfcf94caa6 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -25,10 +25,10 @@ default: # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min - factor: 0.5 + factor: 0.75 threshold: 0.0001 threshold_mode: abs - min_lr: 0.00001 + min_lr: 0.000001 verbose: true note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 7c0a5fc071..ca440b1854 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -178,15 +178,16 @@ def should_stop(self, metric, lr=None, epoch=None): else: self.counter += 1 - if self.warmup_epochs > 0 and epoch is not None and epoch < self.warmup_epochs: - self.counter = 0 - if self.counter >= self.patience: self.early_stop = "metric" if lr is not None and lr <= self.min_lr: self.early_stop = "lr" + if self.warmup_epochs > 0 and epoch is not None and epoch < self.warmup_epochs: + self.early_stop = "" + self.counter = 0 + return self.early_stop @property From d341351438e535494f303b076dd4391a073e07be Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:35:28 -0500 Subject: [PATCH 180/273] parse DatabaseTimeout --- ocpmodels/common/exp_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 7a46e9166d..5c072467b6 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -215,6 +215,8 @@ def parse_output_files(self): self.cache["job_state"][j] = "TimeLimit" elif "RaceCondition" in out_txt: self.cache["job_state"][j] = "RaceCondition" + elif "DatabaseTimeout: Could not acquire lock for PickledDB" in out_txt: + self.cache["job_state"][j] = "DatabaseTimeout" elif "Traceback" in out_txt: self.cache["job_state"][j] = ( "Traceback: " + out_txt.split("Traceback")[1] From 27b0eb0f0329a1c6b70bffc5cc2e3c931a9ad646 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:36:49 -0500 Subject: [PATCH 181/273] parse WaitingForTrials --- ocpmodels/common/exp_manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 5c072467b6..85916787ee 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -217,6 +217,11 @@ def parse_output_files(self): self.cache["job_state"][j] = "RaceCondition" elif "DatabaseTimeout: Could not acquire lock for PickledDB" in out_txt: self.cache["job_state"][j] = "DatabaseTimeout" + elif ( + "Algo does not have more trials to sample.Waiting for current trials to finish" # noqa: E501 + in out_txt + ): + self.cache["job_state"][j] = "WaitingForTrials" elif "Traceback" in out_txt: self.cache["job_state"][j] = ( "Traceback: " + out_txt.split("Traceback")[1] From 43da4ee65c266f185541212c32f1bf649927ff59 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: 
Wed, 18 Jan 2023 20:39:19 -0500 Subject: [PATCH 182/273] cleaner prints --- ocpmodels/common/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 85916787ee..6fae71cf91 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -251,7 +251,7 @@ def print_output_files_stats(self): stats[o]["n"] += 1 stats[o]["ids"].append(j) for s, v in stats.items(): - print(f"• {s:31}" + f": {v['n']} (" + " ".join(v["ids"]) + ")") + print(f"\n• {s:31}" + f": {v['n']}\n " + " ".join(v["ids"])) if stats["Traceback"]["n"] > 0 and self.print_tracebacks: print("\nTraceback contents:\n" + "-" * 19 + "\n") print( From 2de50ce7f40e787c0fc0f47cd4b0500e6961cade Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 22:44:26 -0500 Subject: [PATCH 183/273] typo in normalizer state dict --- ocpmodels/modules/normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/modules/normalizer.py b/ocpmodels/modules/normalizer.py index bbe169eee9..df2830e276 100644 --- a/ocpmodels/modules/normalizer.py +++ b/ocpmodels/modules/normalizer.py @@ -54,7 +54,7 @@ def denorm(self, normed_tensor, hofs=None): def state_dict(self): sd = {"mean": self.mean, "std": self.std} - if self.hof_rescales: + if self.hof_mean is not None: sd["hof_rescales"] = { "mean": self.hof_mean, "std": self.hof_std, From 64da2d0bd8609178c5484b37d89fd9d8b23ab49c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 23:02:53 -0500 Subject: [PATCH 184/273] enable restart from dir --- ocpmodels/common/flags.py | 6 ++++ ocpmodels/common/utils.py | 49 ++++++++++++++++++++++-------- ocpmodels/trainers/base_trainer.py | 8 ++--- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index f7da16d626..2a8caa3e91 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -89,6 +89,12 @@ def add_core_args(self): self.parser.add_argument( "--continue_from_dir", type=str, help="Run to continue, loading its config" ) + self.parser.add_argument( + "--restart_from_dir", + type=str, + help="Run to restart, loading its config and overwriting " + + "from the command-line", + ) self.parser.add_argument( "--timestamp-id", default=None, diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index b7e337843f..f0de988127 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -282,7 +282,7 @@ def read_slurm_env(config): return config -def continue_from_slurm_job_id(config): +def continue_from_slurm_job_id(config, from_best=False): """ Assuming runs are consistently executed in a `run_dir` with the `run_dir/$SLURM_JOBID` pattern, this functions looks for an existing @@ -298,6 +298,8 @@ def continue_from_slurm_job_id(config): Args: config (dict): The original config to overwrite + from_best (bool, optional): If True, only looks for `best_checkpoint.pt`. + otherwise, looks for the latest checkpoint. Defaults to False. 
Returns: dict: The updated config if a checkpoint has been found @@ -314,9 +316,12 @@ def continue_from_slurm_job_id(config): if not ckpt_dir.exists() or not ckpt_dir.is_dir(): return config - best_ckp = ckpt_dir / "best_checkpoint.pt" - if best_ckp.exists(): - config["checkpoint"] = str(best_ckp) + if from_best: + best_ckp = ckpt_dir / "best_checkpoint.pt" + if best_ckp.exists(): + ckpt = str(best_ckp) + else: + raise FileNotFoundError(f"No best checkpoint found in {str(ckpt_dir)}") else: ckpts = list(ckpt_dir.glob("checkpoint-*.pt")) if not ckpts: @@ -325,7 +330,11 @@ def continue_from_slurm_job_id(config): ckpts, key=lambda f: float(f.stem.split("checkpoint-")[-1]) )[-1] if latest_ckpt.exists() and latest_ckpt.is_file(): - config["checkpoint"] = str(latest_ckpt) + ckpt = str(latest_ckpt) + + if ckpt: + config["checkpoint"] = ckpt + print(f"\n🎁 Resuming based on $SLURM_JOB_ID {JOB_ID} from {ckpt}\n") return config @@ -1108,8 +1117,16 @@ def build_config(args, args_override): if args_override != []: overrides = create_dict_from_args(args_override) - if args.continue_from_dir: - cont_dir = resolve(args.continue_from_dir) + if args.continue_from_dir or args.restart_from_dir: + if args.continue_from_dir and args.restart_from_dir: + raise ValueError( + "Cannot specify both --continue_from_dir and --restart_from_dir." + ) + cont_dir = ( + resolve(args.continue_from_dir) + if args.continue_from_dir + else resolve(args.restart_from_dir) + ) ckpts = list(cont_dir.glob("checkpoints/checkpoint-*.pt")) if not ckpts: print( @@ -1120,9 +1137,15 @@ def build_config(args, args_override): latest_ckpt = str( sorted(ckpts, key=lambda c: float(c.stem.split("-")[-1]))[-1] ) - continue_config["checkpoint"] = str(latest_ckpt) + if args.continue_from_dir: + continue_config["checkpoint"] = str(latest_ckpt) continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] - print("✅ Loading config from cont dir and latest checkpoint:", latest_ckpt) + print( + f"✅ Loading config from directory {str(cont_dir)}" + + f" and latest checkpoint: {latest_ckpt}" + if args.continue_from_dir + else "" + ) args.config = continue_config["config"] config = load_config(args.config) @@ -1137,13 +1160,13 @@ def build_config(args, args_override): config["world_size"] = args.num_nodes * args.num_gpus if continue_config: - dirs_k_v = [(k, v) for k, v in config.items() if "dir" in k] - dataset_config = copy.deepcopy(config["dataset"]) + new_dirs = [(k, v) for k, v in config.items() if "dir" in k] + # dataset_config = copy.deepcopy(config["dataset"]) config = merge_dicts( continue_config, - {k: resolve(v) if isinstance(v, str) else v for k, v in dirs_k_v}, + {k: resolve(v) if isinstance(v, str) else v for k, v in new_dirs}, ) - config["dataset"] = dataset_config + # config["dataset"] = dataset_config config = merge_dicts(config, cli_args_dict()) config = merge_dicts(config, overrides) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 507d8eefc9..c22ef01139 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -320,10 +320,10 @@ def load_datasets(self): std=self.normalizer["target_std"], device=self.device, ) - if "hof_rescales" in self.normalizer: - self.normalizers["target"].set_hof_rescales( - self.normalizer["hof_rescales"] - ) + if "hof_rescales" in self.normalizer: + self.normalizers["target"].set_hof_rescales( + self.normalizer["hof_rescales"] + ) else: self.normalizers["target"] = Normalizer( tensor=self.datasets["train"].data.y[ From 
ef0715db083ab754147830a98035a9996a48b5a3 Mon Sep 17 00:00:00 2001
From: Victor Schmidt
Date: Wed, 18 Jan 2023 23:14:00 -0500
Subject: [PATCH 185/273] raise value error in case of missing config arg

---
 ocpmodels/common/utils.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py
index f0de988127..2c201644f0 100644
--- a/ocpmodels/common/utils.py
+++ b/ocpmodels/common/utils.py
@@ -1142,12 +1142,19 @@ def build_config(args, args_override):
         continue_config = torch.load((latest_ckpt), map_location="cpu")["config"]
         print(
             f"✅ Loading config from directory {str(cont_dir)}"
-            + f" and latest checkpoint: {latest_ckpt}"
-            if args.continue_from_dir
-            else ""
+            + (
+                f" and latest checkpoint: {latest_ckpt}"
+                if args.continue_from_dir
+                else " (restarting from scratch)"
+            )
         )
         args.config = continue_config["config"]
 
+    if args.config is None:
+        raise ValueError(
+            "Must specify a config file with " + f"--config. Received args: {args}"
+        )
+
     config = load_config(args.config)
     config = merge_dicts(config, args_dict_with_defaults)
     config = merge_dicts(config, overrides)

From d4d00a6569e6ea8923bc5be8e01e62a1e4aafc94 Mon Sep 17 00:00:00 2001
From: Victor Schmidt
Date: Wed, 18 Jan 2023 23:34:27 -0500
Subject: [PATCH 186/273] qm9 fanet v4

---
 configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml

diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml
new file mode 100644
index 0000000000..92774ccd0f
--- /dev/null
+++ b/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml
@@ -0,0 +1,68 @@
+# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij
+job:
+  mem: 8GB
+  cpus: 4
+  gres: gpu:1
+  time: 02:50:00
+  partition: long
+
+default:
+  wandb_project: ocp-qm
+  config: fanet-qm9-all
+  mode: train
+  test_ri: true
+  wandb_tags: qm9, orion
+  log_train_every: 100
+  optim:
+    batch_size: 32
+    warmup_steps: 3000
+    # parameters EMA
+    ema_decay: 0.999
+    loss_energy: mse
+    # early stopping
+    es_patience: 20
+    es_min_abs_change: 0.000001
+    es_warmup_epochs: 800
+    # all below is for the scheduler
+    scheduler: ReduceLROnPlateau
+    mode: min
+    factor: 0.75
+    threshold: 0.0001
+    threshold_mode: abs
+    min_lr: 0.000001
+    verbose: true
+  note:
+    model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm
+    optim: batch_size, lr_initial
+    _root_: frame_averaging, fa_frames
+  orion_mult_factor:
+    value: 32
+    targets: num_filters, pg_hidden_channels, phys_hidden_channels, num_gaussians
+  frame_averaging: 3D
+  fa_frames: random
+  model:
+    edge_embed_type: all_rij
+    energy_head: ""
+
+orion:
+  # Remember to change the experiment name if you change anything in the search space
+  n_jobs: 20
+
+  unique_exp_name: fanet-qm9-v4.0.0
+
+  space:
+    optim/max_epochs: fidelity(200, 2000, base=5)
+    optim/lr_initial: loguniform(1e-4, 6e-4, precision=3)
+    model/graph_norm: choices([True, False])
+    model/hidden_channels: uniform(5, 15, discrete=True)
+    model/mp_type: choices(["simple", "base", "updownscale", "updownscale_base"])
+    model/num_filters: uniform(4, 16, discrete=True)
+    model/num_gaussians: uniform(1, 4, discrete=True)
+    model/num_interactions: uniform(3, 5, discrete=True)
+    model/pg_hidden_channels: uniform(0, 1, discrete=True)
+    model/phys_embeds: 
choices([True, False]) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From b752916a2d30a387a3982a8d29a27b91b7e8b4ce Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 23:46:18 -0500 Subject: [PATCH 187/273] min hidden_channels --- configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml index 92774ccd0f..b43d252a8e 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml @@ -48,13 +48,13 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v4.0.0 + unique_exp_name: fanet-qm9-v4.0.1 space: optim/max_epochs: fidelity(200, 2000, base=5) optim/lr_initial: loguniform(1e-4, 6e-4, precision=3) model/graph_norm: choices([True, False]) - model/hidden_channels: uniform(5, 15, discrete=True) + model/hidden_channels: uniform(6, 15, discrete=True) model/mp_type: choices(["simple", "base", "updownscale", "updownscale_base"]) model/num_filters: uniform(4, 16, discrete=True) model/num_gaussians: uniform(1, 4, discrete=True) From 585b5bee94f49ecea04020b245899135c23bc5aa Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 23:52:42 -0500 Subject: [PATCH 188/273] report 1e12 for error in trainer init --- main.py | 40 +++++++++++++++++++++++---------- ocpmodels/common/exp_manager.py | 5 +++++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/main.py b/main.py index dc763ce3de..ea6aa210ac 100644 --- a/main.py +++ b/main.py @@ -8,10 +8,8 @@ import copy import logging import os -import shutil import time import traceback -import warnings import torch from orion.core.utils.exceptions import ReservationRaceCondition @@ -65,7 +63,7 @@ def __init__(self, trainer_config): self.hparams = {} def run(self, orion_exp=None): - orion_trial = None + orion_trial = signal = None self.original_config = copy.deepcopy(self.trainer_config) orion_race_condition = False if dist_utils.is_master(): @@ -110,31 +108,49 @@ def run(self, orion_exp=None): self.trainer_config = continue_orion_exp(self.trainer_config) self.trainer_config = auto_note(self.trainer_config) cls = registry.get_trainer_class(self.trainer_config["trainer"]) - self.trainer: BaseTrainer = cls(**self.trainer_config) - task = registry.get_task_class(self.trainer_config["mode"])(self.trainer_config) - task.setup(self.trainer) - start_time = time.time() - print_warnings() + try: + self.trainer: BaseTrainer = cls(**self.trainer_config) + except Exception as e: + print(f"Error in trainer initialization: {e}") + traceback.print_exc() + signal = "trainer_init_error" + + if signal is None: + task = registry.get_task_class(self.trainer_config["mode"])( + self.trainer_config + ) + task.setup(self.trainer) + start_time = time.time() + print_warnings() - signal = task.run() + signal = task.run() # handle job preemption / time limit if signal == "SIGTERM": print("\nJob was preempted. 
Wrapping up...\n") - self.trainer.close_datasets() + if self.trainer: + self.trainer.close_datasets() dist_utils.synchronize() logging.info(f"Total time taken: {time.time() - start_time}") - if self.trainer.logger is not None: + if self.trainer and self.trainer.logger is not None: self.trainer.logger.log({"Total time": time.time() - start_time}) - objective = dist_utils.broadcast_from_master(self.trainer.objective) + objective = dist_utils.broadcast_from_master( + self.trainer.objective if self.trainer else None + ) if orion_exp is not None: if objective is None: if signal == "loss_is_nan": objective = 1e12 print("Received NaN objective from worker. Setting to 1e12.") + if signal == "trainer_init_error": + objective = 1e12 + print( + "Received trainer_init_error from worker.", + "Setting objective to 1e12.", + ) else: print("Received None objective from worker. Skipping observation.") if objective is not None: diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 6fae71cf91..6a2bead459 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -222,6 +222,11 @@ def parse_output_files(self): in out_txt ): self.cache["job_state"][j] = "WaitingForTrials" + elif ( + "RuntimeError: Trying to create tensor with negative dimension" + in out_txt + ): + self.cache["job_state"][j] = "NegativeEmbeddingDimension" elif "Traceback" in out_txt: self.cache["job_state"][j] = ( "Traceback: " + out_txt.split("Traceback")[1] From 04487ec37b5eb86f0eca5151dfa041c88325e17a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 00:00:39 -0500 Subject: [PATCH 189/273] fix unbounded var start_time --- main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index ea6aa210ac..ac749e1d2e 100644 --- a/main.py +++ b/main.py @@ -115,12 +115,12 @@ def run(self, orion_exp=None): traceback.print_exc() signal = "trainer_init_error" + start_time = time.time() if signal is None: task = registry.get_task_class(self.trainer_config["mode"])( self.trainer_config ) task.setup(self.trainer) - start_time = time.time() print_warnings() signal = task.run() @@ -132,9 +132,10 @@ def run(self, orion_exp=None): self.trainer.close_datasets() dist_utils.synchronize() - logging.info(f"Total time taken: {time.time() - start_time}") + total_time = time.time() - start_time + logging.info(f"Total time taken: {total_time}") if self.trainer and self.trainer.logger is not None: - self.trainer.logger.log({"Total time": time.time() - start_time}) + self.trainer.logger.log({"Total time": total_time}) objective = dist_utils.broadcast_from_master( self.trainer.objective if self.trainer else None From 6607071d9bd2c1e7a9d4f7caa765621527e66ebe Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 19 Jan 2023 03:44:13 -0500 Subject: [PATCH 190/273] orion and top config --- .../exps/icml/is2re-all/fanet-orion-2.yaml | 2 +- .../exps/icml/is2re-all/fanet-orion-3.yaml | 8 +- .../exps/icml/is2re-all/fanet-orion-4.yaml | 60 ++++ configs/exps/icml/is2re-all/top-config-2.yaml | 263 ++++++++++++++++ configs/exps/icml/is2re-all/top-config.yaml | 286 ++++++++++++++++-- scripts/gnn_dev.py | 3 +- 6 files changed, 589 insertions(+), 33 deletions(-) create mode 100644 configs/exps/icml/is2re-all/fanet-orion-4.yaml create mode 100644 configs/exps/icml/is2re-all/top-config-2.yaml diff --git a/configs/exps/icml/is2re-all/fanet-orion-2.yaml b/configs/exps/icml/is2re-all/fanet-orion-2.yaml index cf88591af6..a41f774433 100644 --- 
a/configs/exps/icml/is2re-all/fanet-orion-2.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-2.yaml @@ -32,7 +32,7 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 72 + n_jobs: 12 unique_exp_name: fanet-is2re-all-v2 diff --git a/configs/exps/icml/is2re-all/fanet-orion-3.yaml b/configs/exps/icml/is2re-all/fanet-orion-3.yaml index 8daecd138c..f57167d004 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-3.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-3.yaml @@ -3,7 +3,7 @@ job: mem: 32GB cpus: 4 gres: gpu:rtx8000:1 - time: 14:00:00 + time: 12:00:00 partition: long default: @@ -30,9 +30,9 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 72 + n_jobs: 216 - unique_exp_name: fanet-is2re-all-v2 + unique_exp_name: fanet-is2re-all-v3 space: model/complex_mp: choices([True, False]) @@ -49,7 +49,7 @@ orion: model/pg_hidden_channels: uniform(1, 2, discrete=True) model/phys_embeds: choices([True, False]) model/second_layer_MLP: choices([True, False]) - model/skip_co: choices(["concat", False]) + model/skip_co: choices(["concat", False, "concat-atom"]) model/tag_hidden_channels: uniform(1, 2, discrete=True) optim/lr_initial: loguniform(9e-4, 5e-3, precision=2) optim/max_epochs: fidelity(7, 15, base=6) diff --git a/configs/exps/icml/is2re-all/fanet-orion-4.yaml b/configs/exps/icml/is2re-all/fanet-orion-4.yaml new file mode 100644 index 0000000000..f86ea559f4 --- /dev/null +++ b/configs/exps/icml/is2re-all/fanet-orion-4.yaml @@ -0,0 +1,60 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + time: 12:00:00 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-is2re-all + mode: train + test_ri: True + wandb_tags: is2re-all, orion-3 + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + graph_norm: True + frame_averaging: 2D + optim: + scheduler: LinearWarmupCosineAnnealingLR + eval_every: 0.5 + lr_initial: 0.002 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 150 + + unique_exp_name: fanet-is2re-all-v4 + + space: + model/complex_mp: choices([True, False]) + model/cutoff: choices([4.0, 6.0]) + model/edge_embed_type: choices(["all_rij"]) + model/energy_head: choices(["weighted-av-final-embeds"]) + model/fa_frames: choices(["random", "se3-random"]) + model/hidden_channels: uniform(9, 17, discrete=True) + model/max_num_neighbors: choices([30, 40, 50]) + model/mp_type: choices(["base", "updownscale", "updownscale_base", "updown_local_env"]) + model/num_filters: uniform(4, 15, discrete=True) + model/num_gaussians: uniform(40, 140, discrete=True) + model/num_interactions: uniform(4, 8, discrete=True) + model/pg_hidden_channels: uniform(1, 3, discrete=True) + model/phys_embeds: choices([True, False]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["concat", False, "concat-atom"]) + model/tag_hidden_channels: uniform(1, 2, 
discrete=True) + optim/max_epochs: fidelity(6, 12, base=6) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 1 diff --git a/configs/exps/icml/is2re-all/top-config-2.yaml b/configs/exps/icml/is2re-all/top-config-2.yaml new file mode 100644 index 0000000000..00bf54b35b --- /dev/null +++ b/configs/exps/icml/is2re-all/top-config-2.yaml @@ -0,0 +1,263 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 15:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + wandb_tags: 'best-config' + optim: + batch_size: 256 + eval_batch_size: 256 + max_epochs: 9 + cp_data_to_tmpdir: true + +runs: + + - config: fanet-is2re-all # 2700544 + note: 'top-02-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 300 + num_filters: 300 + num_gaussians: 70 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0022 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 8 + eval_every: 0.4 + + - config: fanet-is2re-all + note: 'top-1-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: true + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 8 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-1-modif' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 375 + num_filters: 448 + num_gaussians: 110 + num_interactions: 6 + skip_co: concat + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-2-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: updown_local_env + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 192 + num_filters: 480 + num_gaussians: 98 + num_interactions: 5 + second_layer_MLP: True + skip_co: add + optim: + lr_initial: 0.0027 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-3' + frame_averaging: 2D + fa_frames: random + model: + mp_type: updown_local_env + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 288 + num_filters: 480 + num_gaussians: 45 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 4.0 + optim: + lr_initial: 0.003 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-3-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: updown_local_env + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 288 + num_filters: 480 + num_gaussians: 90 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 
0.003 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-4' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: False + tag_hidden_channels: 0 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 384 + num_gaussians: 77 + num_interactions: 4 + second_layer_MLP: False + skip_co: False + cutoff: 10.0 + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-4-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 384 + num_gaussians: 90 + num_interactions: 4 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-5' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: False + complex_mp: True + graph_norm: True + hidden_channels: 320 + num_filters: 416 + num_gaussians: 36 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.0034 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-5-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 350 + num_filters: 416 + num_gaussians: 80 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0034 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-6' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 480 + num_filters: 352 + num_gaussians: 72 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 6.0 + optim: + lr_initial: 0.0034 + scheduler: LinearWarmupCosineAnnealingLR \ No newline at end of file diff --git a/configs/exps/icml/is2re-all/top-config.yaml b/configs/exps/icml/is2re-all/top-config.yaml index 0debe73c19..845ef052ff 100644 --- a/configs/exps/icml/is2re-all/top-config.yaml +++ b/configs/exps/icml/is2re-all/top-config.yaml @@ -39,11 +39,11 @@ runs: optim: lr_initial: 0.0019 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 9 + max_epochs: 20 + eval_every: 0.4 - config: fanet-is2re-all note: 'top-1-FA' - frame_averaging: 2D - fa_frames: all + frame_averaging: DA model: mp_type: updownscale phys_embeds: False @@ -61,47 +61,279 @@ runs: optim: lr_initial: 0.0019 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 9 - - config: fanet-is2re-all - note: 'top-1-FA' - frame_averaging: DA + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700539 + note: 'top-3-modif' + frame_averaging: 2D + fa_frames: random model: - mp_type: updownscale + mp_type: updown_local_env phys_embeds: False tag_hidden_channels: 32 pg_hidden_channels: 64 energy_head: weighted-av-final-embeds - complex_mp: False + complex_mp: True graph_norm: True - hidden_channels: 352 - num_filters: 
448 - num_gaussians: 99 - num_interactions: 6 + hidden_channels: 300 + num_filters: 480 + num_gaussians: 90 + num_interactions: 5 second_layer_MLP: True skip_co: concat - optim: - lr_initial: 0.0019 + cutoff: 6.0 + optim: + lr_initial: 0.003 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 9 - - config: fanet-is2re-all - note: 'top-1-FA' - frame_averaging: 3D + eval_every: 0.4 + max_epochs: 18 + + - config: fanet-is2re-all # 2700540 + note: 'top-4-modif' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: False + tag_hidden_channels: 0 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 384 + num_gaussians: 77 + num_interactions: 4 + second_layer_MLP: True + skip_co: concat-atom + cutoff: 8.0 + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700541 + note: 'top-4-DA' + frame_averaging: DA + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 384 + num_gaussians: 90 + num_interactions: 4 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700542 + note: 'top-5' + frame_averaging: 2D fa_frames: random model: - mp_type: updownscale + mp_type: base + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: False + complex_mp: True + graph_norm: True + hidden_channels: 320 + num_filters: 416 + num_gaussians: 36 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.0034 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700543 + note: 'top-5-modif' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 350 + num_filters: 416 + num_gaussians: 80 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat-atom + cutoff: 6.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-6' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 480 + num_filters: 352 + num_gaussians: 72 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-01' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 512 + num_filters: 200 + num_gaussians: 150 + num_interactions: 4 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0023 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 
+ + - config: fanet-is2re-all # 2700544 + note: 'top-01-modif' + frame_averaging: 2D + fa_frames: DA + model: + mp_type: base + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 512 + num_filters: 250 + num_gaussians: 130 + num_interactions: 4 + second_layer_MLP: True + skip_co: False + cutoff: 6.0 + optim: + lr_initial: 0.0023 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-01-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base phys_embeds: False tag_hidden_channels: 32 pg_hidden_channels: 64 energy_head: weighted-av-final-embeds complex_mp: False graph_norm: True - hidden_channels: 352 - num_filters: 448 - num_gaussians: 99 + hidden_channels: 512 + num_filters: 300 + num_gaussians: 130 + num_interactions: 4 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0023 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-02-modif' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 275 + num_filters: 288 + num_gaussians: 60 num_interactions: 6 - second_layer_MLP: True + second_layer_MLP: False + skip_co: add + cutoff: 6.0 + optim: + lr_initial: 0.0022 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-02-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 300 + num_filters: 300 + num_gaussians: 70 + num_interactions: 6 + second_layer_MLP: False skip_co: concat - optim: - lr_initial: 0.0019 + cutoff: 6.0 + optim: + lr_initial: 0.0022 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 9 + max_epochs: 20 + eval_every: 0.4 \ No newline at end of file diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 617e54cc18..28c8f55e1f 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -25,10 +25,11 @@ config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "all_rij" config["model"]["mp_type"] = "base" - config["model"]["skip_co"] = False + config["model"]["skip_co"] = "concat-atom" # add, concat, config["model"]["att_heads"] = 3 config["model"]["complex_mp"] = True config["model"]["graph_norm"] = True + config["optim"]["eval_every"] = 0.5 # config["model"]["regress_forces"] = "direct_with_gradient_target" checkpoint_path = None From f332cdb015d0fd83bdfc99434abcbe9f1e45b82c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 15:45:03 -0500 Subject: [PATCH 191/273] implement qm7x lse data shift --- configs/exps/qm7x/schnet-fanet-lse.yaml | 113 +++++++++++++++++++ configs/exps/qm7x/schnet-fanet-noenergy.yaml | 108 ++++++++++++++++++ configs/exps/qm7x/schnet-fanet.yaml | 5 + configs/models/qm7x-metadata/lse-shifts.json | 1 + configs/models/tasks/qm7x.yaml | 5 +- ocpmodels/common/utils.py | 12 ++ ocpmodels/datasets/qm7x.py | 23 +++- ocpmodels/trainers/base_trainer.py | 8 -- scripts/compute_qm7x_lse.py | 49 ++++++++ 9 files changed, 312 insertions(+), 12 deletions(-) create mode 100644 
configs/exps/qm7x/schnet-fanet-lse.yaml create mode 100644 configs/exps/qm7x/schnet-fanet-noenergy.yaml create mode 100644 configs/models/qm7x-metadata/lse-shifts.json create mode 100644 scripts/compute_qm7x_lse.py diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml new file mode 100644 index 0000000000..97097a6c16 --- /dev/null +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -0,0 +1,113 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: schnet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions + optim: batch_size, lr_initial + optim: + batch_size: 10 + max_epochs: 100 + warmup_steps: 3000 + lr_initial: 0.0001 + eval_every: 0.34 + # parameters EMA + ema_decay: 0.999 + energy_coefficient: 0. + energy_grad_coefficient: 0 + force_coefficient: 1. + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + hidden_channels: 128 + num_filters: 128 + num_gaussians: 20 + num_interactions: 6 + cutoff: 5.0 + regress_forces: from_energy + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + val_ood: + lse_shift: True + +runs: + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + optim: + lr_initial: 0.001 + batch_size: 100 + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + optim: + lr_initial: 0.001 + batch_size: 256 + + - config: fanet-qm7x-all + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + + - config: fanet-qm7x-all + optim: + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct_with_gradient_target + + - config: fanet-qm7x-all + optim: + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct + mp_type: updownscale_base + num_interactions: 4 diff --git a/configs/exps/qm7x/schnet-fanet-noenergy.yaml b/configs/exps/qm7x/schnet-fanet-noenergy.yaml new file mode 100644 index 0000000000..bbe89b7f75 --- /dev/null +++ b/configs/exps/qm7x/schnet-fanet-noenergy.yaml @@ -0,0 +1,108 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: schnet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions + optim: batch_size, lr_initial + optim: + batch_size: 10 + max_epochs: 100 + warmup_steps: 3000 + lr_initial: 0.0001 + eval_every: 0.34 + # parameters EMA + 
ema_decay: 0.999 + energy_coefficient: 0. + energy_grad_coefficient: 0 + force_coefficient: 1. + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + hidden_channels: 128 + num_filters: 128 + num_gaussians: 20 + num_interactions: 6 + cutoff: 5.0 + regress_forces: from_energy + dataset: + train: + rescale_with_hof: False + +runs: + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + optim: + lr_initial: 0.001 + batch_size: 100 + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + optim: + lr_initial: 0.001 + batch_size: 256 + + - config: fanet-qm7x-all + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + + - config: fanet-qm7x-all + optim: + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct_with_gradient_target + + - config: fanet-qm7x-all + optim: + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct + mp_type: updownscale_base + num_interactions: 4 diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index 10c22056b1..fd91abe050 100644 --- a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -55,6 +55,11 @@ default: dataset: train: rescale_with_hof: True + lse_shift: False + val_id: + lse_shift: False + val_ood: + lse_shift: False runs: - config: schnet-qm7x-all diff --git a/configs/models/qm7x-metadata/lse-shifts.json b/configs/models/qm7x-metadata/lse-shifts.json new file mode 100644 index 0000000000..8893e2a5cc --- /dev/null +++ b/configs/models/qm7x-metadata/lse-shifts.json @@ -0,0 +1 @@ +[0.0, -16.48365429710017, 0.0, 0.0, 0.0, 0.0, -1035.230325647512, -1488.1741712581756, -2045.3532693858685, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -10832.70108036143, -12520.741665730922] \ No newline at end of file diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index c832e2c82f..3408bf5b8f 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -36,7 +36,8 @@ default: target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 - rescale_with_hof: True + rescale_with_hof: False + lse_shift: True val_id: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by utils.py if this is True @@ -44,6 +45,7 @@ default: target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 + lse_shift: True val_ood: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by utils.py if this is True @@ -51,6 +53,7 @@ default: target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 + lse_shift: True # TEST SET DO NOT ENABLE # - src: /network/projects/ocp/qm9 diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 2c201644f0..2d79d56f06 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -307,6 +307,9 @@ def continue_from_slurm_job_id(config, from_best=False): if config.get("checkpoint"): return config + if config.get("no-resume"): + return config + job_id = os.environ.get("SLURM_JOBID") if job_id is None: return config @@ -536,6 +539,15 @@ def 
set_qm7x_target_stats(trainer_config): continue if not dataset.get("normalize_labels", False): continue + else: + if dataset.get("lse_shift"): + print( + "Setting normalize_labels to False because of lse_shift for split", + f"{d}.", + ) + trainer_config["dataset"][d]["normalize_labels"] = False + continue + assert "target" in dataset, "target must be specified." mean = target_stats[dataset["target"]]["mean"] std = target_stats[dataset["target"]]["std"] diff --git a/ocpmodels/datasets/qm7x.py b/ocpmodels/datasets/qm7x.py index f98774a60c..97c3b85225 100644 --- a/ocpmodels/datasets/qm7x.py +++ b/ocpmodels/datasets/qm7x.py @@ -710,6 +710,7 @@ def __init__( }, transform=None, ): + self.config = config lmdb_path = Path(config["src"]).expanduser().resolve() self.lmdb_path = str(lmdb_path) if not lmdb_path.exists(): @@ -762,6 +763,20 @@ def __init__( self.hofs[np.isnan(self.hofs)] = self.hofs[~np.isnan(self.hofs)].mean() self.hofs = torch.from_numpy(self.hofs).float() + self.lse_shifts = None + if self.config.get("lse_shift"): + self.lse_shifts = torch.tensor( + json.loads( + ( + ROOT + / "configs" + / "models" + / "qm7x-metadata" + / "lse-shifts.json" + ).read_text() + ) + ) + self.transform = transform def __len__(self): @@ -793,9 +808,11 @@ def __getitem__(self, i): data.natoms = len(data.pos) data.tags = torch.full((data.natoms,), -1, dtype=torch.long) data.atomic_numbers = torch.tensor(data.atNUM, dtype=torch.long) - data.hofs = self.hofs[ - data.atomic_numbers.numpy().astype(int) - 1 # element 1 is at row 0 - ].sum() + data.hofs = self.hofs[data.atomic_numbers - 1].sum() # element 1 is at row 0 + if self.lse_shifts is not None: + data.lse_shift = self.lse_shifts[data.atomic_numbers].sum() + data.y_unshifted = data.y + data.y = data.y - data.lse_shift t1 = time.time_ns() if self.transform is not None: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index c22ef01139..bbc647ee65 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -13,7 +13,6 @@ from abc import ABC, abstractmethod from collections import defaultdict from copy import deepcopy -from pathlib import Path import numpy as np import torch @@ -341,13 +340,6 @@ def load_task(self): pass def load_model(self): - # Build model - if not self.silent: - print( - f"🧠 Loading model {self.config['model_name']}:\n" - + f" {yaml.dump(self.config['model'])}" - ) - bond_feat_dim = None bond_feat_dim = self.config["model"].get("num_gaussians", 50) diff --git a/scripts/compute_qm7x_lse.py b/scripts/compute_qm7x_lse.py new file mode 100644 index 0000000000..e831f22723 --- /dev/null +++ b/scripts/compute_qm7x_lse.py @@ -0,0 +1,49 @@ +import json +from pathlib import Path +import h5py +from tqdm import tqdm +import numpy as np +from sklearn.feature_extraction import DictVectorizer + + +def count_fn(y): + return dict(zip(*np.unique(y, return_counts=True))) + + +if __name__ == "__main__": + # from SO3Krates + # https://github.com/thorben-frank/mlff/blob/v0.1/mlff/src/data/preprocessing.py#L297 + base = Path("/network/projects/ocp/qm7x/source") + h5_paths = sorted(base.glob("*.hdf5")) + h5s = [h5py.File(p, "r") for p in h5_paths] + data = [ + (h5[f"{mol}/{conf}/ePBE0+MBD"][0], h5[f"{mol}/{conf}/atNUM"][:]) + for i, h5 in enumerate(h5s) + for mol in tqdm(h5, desc=f"Reading file {h5_paths[i].name}", leave=False) + for conf in tqdm(h5[mol], desc=f"Molecule {mol}", leave=False) + ] + + q = np.array([d[0] for d in data]) + max_n_atoms = max([len(d[1]) for d in data]) + z = 
np.array([np.pad(d[1], (0, max_n_atoms - len(d[1]))) for d in data]) + u = np.unique(z) + idx_ = u != 0 # remove padding with 0 + lhs_counts = list(map(count_fn, z)) + v = DictVectorizer(sparse=False) + X = v.fit_transform(lhs_counts) + X = X[..., idx_] + + sol = np.linalg.lstsq(X, q, rcond=None) + shifts = np.zeros(np.max(u) + 1) + for k, v in dict(zip(u[idx_], sol[0])).items(): + shifts[k] = v + + ( + Path(__file__).resolve().parent.parent + / "configs" + / "models" + / "qm7x-metadata" + / "lse-shifts.json" + ).write_text(json.dumps(shifts.tolist())) + + q_shifts = shifts[z].sum(-1) From f6573a9d15f6f864794446d285c3ddf76fe230e7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 17:26:36 -0500 Subject: [PATCH 192/273] refactor back without runner and split utils --- main.py | 230 ++++++++++++--------------- ocpmodels/common/logger.py | 1 + ocpmodels/common/orion_utils.py | 227 ++++++++++++++++++++++++++ ocpmodels/common/utils.py | 211 +++--------------------- ocpmodels/trainers/base_trainer.py | 2 +- ocpmodels/trainers/single_trainer.py | 4 +- 6 files changed, 355 insertions(+), 320 deletions(-) create mode 100644 ocpmodels/common/orion_utils.py diff --git a/main.py b/main.py index ac749e1d2e..56993e2624 100644 --- a/main.py +++ b/main.py @@ -10,9 +10,8 @@ import os import time import traceback - +import sys import torch -from orion.core.utils.exceptions import ReservationRaceCondition from yaml import dump from ocpmodels.common import dist_utils @@ -20,21 +19,20 @@ from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, - apply_mult_factor, auto_note, build_config, - continue_from_slurm_job_id, - continue_orion_exp, - load_orion_exp, merge_dicts, move_lmdb_data_to_slurm_tmpdir, - read_slurm_env, resolve, - set_max_fidelity, setup_imports, setup_logging, - unflatten_dict, update_from_sbatch_py_vars, + set_hidden_channels, +) +from ocpmodels.common.orion_utils import ( + continue_orion_exp, + load_orion_exp, + sample_orion_hparams, ) from ocpmodels.trainers import BaseTrainer @@ -56,89 +54,119 @@ def print_warnings(): print("-" * 80 + "\n") -class Runner: - def __init__(self, trainer_config): - self.trainer_config = trainer_config - self.trainer = None - self.hparams = {} +def wrap_up(args, start_time, trainer=None, error=None, signal=None): + + total_time = time.time() - start_time + logging.info(f"Total time taken: {total_time}") + if trainer and trainer.logger is not None: + trainer.logger.log({"Total time": total_time}) + + if args.distributed: + print( + "\nWaiting for all processes to finish with dist_utils.cleanup()...", + end="", + ) + dist_utils.cleanup() + print("Done!") + + if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): + print("\nSelf-canceling SLURM job in 32s", JOB_ID) + os.popen(f"sleep 32 && scancel {JOB_ID}") + + if trainer and trainer.logger: + trainer.logger.finish(error or signal) + + +if __name__ == "__main__": + error = signal = orion_exp = orion_trial = None + orion_race_condition = False + hparams = {} + + setup_logging() + + parser = flags.get_parser() + args, override_args = parser.parse_known_args() + args = update_from_sbatch_py_vars(args) + if args.logdir: + args.logdir = resolve(args.logdir) + + trainer_config = build_config(args, override_args) + original_trainer_config = copy.deepcopy(trainer_config) + + if args.distributed: + dist_utils.setup(trainer_config) + print("Distributed backend setup.") + + if dist_utils.is_master(): + trainer_config = 
move_lmdb_data_to_slurm_tmpdir(trainer_config) + dist_utils.synchronize() + + trainer_config["dataset"] = dist_utils.broadcast_from_master( + trainer_config["dataset"] + ) + + # -- Initial setup + + setup_imports() + print("All things imported.\n") + start_time = time.time() + + try: + + # -- Orion + + if args.orion_exp_config_path and dist_utils.is_master(): + orion_exp = load_orion_exp(args) - def run(self, orion_exp=None): - orion_trial = signal = None - self.original_config = copy.deepcopy(self.trainer_config) - orion_race_condition = False if dist_utils.is_master(): if orion_exp: - try: - orion_trial = orion_exp.suggest(1) - print( - "\n🚨 Orion reservation race condition detected. Exiting", - "and deleting run dir", - ) - self.hparams = set_max_fidelity( - unflatten_dict( - apply_mult_factor( - orion_trial.params, - self.trainer_config.get("orion_mult_factor"), - sep="/", - ), - sep="/", - ), - orion_exp, - ) - self.hparams["orion_hash_params"] = orion_trial.hash_params - self.hparams["orion_unique_exp_name"] = orion_exp.name - except ReservationRaceCondition: - orion_race_condition = True - import wandb - - if wandb.run is not None: - if wandb.run.tags: - wandb.run.tags = wandb.run.tags + ("RaceCondition",) - else: - wandb.run.tags = ("RaceCondition",) - - self.hparams, orion_race_condition = dist_utils.broadcast_from_master( - self.hparams, orion_race_condition - ) - if self.hparams: + hparams = sample_orion_hparams(orion_exp, trainer_config) + if hparams.get("orion_race_condition"): + logging.warning("\n\n ⛔️ Orion race condition. Stopping here.\n\n") + wrap_up(args, start_time, error, signal) + sys.exit() + + hparams = dist_utils.broadcast_from_master(hparams) + if hparams: print("\n💎 Received hyper-parameters from Orion:") - print(dump(self.hparams), end="\n") + print(dump(hparams), end="\n") + trainer_config = merge_dicts(trainer_config, hparams) + + # -- Setup trainer + + trainer_config = continue_orion_exp(trainer_config) + trainer_config = auto_note(trainer_config) + trainer_config = set_hidden_channels(trainer_config) - self.trainer_config = merge_dicts(self.trainer_config, self.hparams) - self.trainer_config = continue_orion_exp(self.trainer_config) - self.trainer_config = auto_note(self.trainer_config) - cls = registry.get_trainer_class(self.trainer_config["trainer"]) try: - self.trainer: BaseTrainer = cls(**self.trainer_config) + cls = registry.get_trainer_class(trainer_config["trainer"]) + trainer: BaseTrainer = cls(**trainer_config) except Exception as e: - print(f"Error in trainer initialization: {e}") traceback.print_exc() + logging.warning(f"\n💀 Error in trainer initialization: {e}\n") signal = "trainer_init_error" - start_time = time.time() if signal is None: - task = registry.get_task_class(self.trainer_config["mode"])( - self.trainer_config - ) - task.setup(self.trainer) + task = registry.get_task_class(trainer_config["mode"])(trainer_config) + task.setup(trainer) print_warnings() + # -- Start Training + signal = task.run() + # -- End of training + # handle job preemption / time limit if signal == "SIGTERM": print("\nJob was preempted. 
Wrapping up...\n") - if self.trainer: - self.trainer.close_datasets() + if trainer: + trainer.close_datasets() dist_utils.synchronize() - total_time = time.time() - start_time - logging.info(f"Total time taken: {total_time}") - if self.trainer and self.trainer.logger is not None: - self.trainer.logger.log({"Total time": total_time}) objective = dist_utils.broadcast_from_master( - self.trainer.objective if self.trainer else None + trainer.objective if trainer else None ) if orion_exp is not None: @@ -160,69 +188,9 @@ def run(self, orion_exp=None): [{"type": "objective", "name": "energy_mae", "value": objective}], ) - -if __name__ == "__main__": - runner = error = signal = None - - setup_logging() - - parser = flags.get_parser() - args, override_args = parser.parse_known_args() - args = update_from_sbatch_py_vars(args) - if args.logdir: - args.logdir = resolve(args.logdir) - - trainer_config = build_config(args, override_args) - trainer_config["optim"]["eval_batch_size"] = trainer_config["optim"]["batch_size"] - - original_trainer_config = copy.deepcopy(trainer_config) - - if args.distributed: - dist_utils.setup(trainer_config) - print("Distributed backend setup.") - - if dist_utils.is_master(): - trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) - # dist_utils.synchronize() - - # ------------------- - # ----- Setup ----- - # ------------------- - setup_imports() - print("All things imported.") - trainer_config = continue_from_slurm_job_id(trainer_config) - trainer_config = read_slurm_env(trainer_config) - runner = Runner(trainer_config) - print("Runner ready.") - - try: - # ------------------- - # ----- Train ----- - # ------------------- - if args.orion_exp_config_path and dist_utils.is_master(): - experiment = load_orion_exp(args) - print("\nStarting runner.") - runner.run(orion_exp=experiment) - else: - print("Starting runner.") - runner.run() - except Exception: error = True print(traceback.format_exc()) finally: - if args.distributed: - print( - "\nWaiting for all processes to finish with dist_utils.cleanup()...", - end="", - ) - dist_utils.cleanup() - print("Done!") - - if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): - print("\nSelf-canceling SLURM job in 32s", JOB_ID) - os.popen(f"sleep 32 && scancel {JOB_ID}") - - if runner and runner.trainer and runner.trainer.logger: - runner.trainer.logger.finish(error or signal) + wrap_up(args, start_time, error, signal, trainer=trainer) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index b628704fcf..a42eb6eeeb 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -138,6 +138,7 @@ def __init__(self, trainer_config): if not CLUSTER.drac: self.collect_output_files(policy="live") self.collect_output_files(policy="end") + print(f"\n{'-'*80}\n") def watch(self, model): wandb.watch(model) diff --git a/ocpmodels/common/orion_utils.py b/ocpmodels/common/orion_utils.py new file mode 100644 index 0000000000..7f44cb5683 --- /dev/null +++ b/ocpmodels/common/orion_utils.py @@ -0,0 +1,227 @@ +import copy +import os +import time +from pathlib import Path +from shutil import copyfile, move + +import yaml +from orion.client import build_experiment +from orion.core.utils.exceptions import ReservationRaceCondition + +from ocpmodels.common.utils import ROOT, RUN_DIR, unflatten_dict + + +def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): + """ + Multiplies all values of orion_hparams listed in mult_factor_dict["targets"] + by mult_factor_dict["value"]. 
+ + eg: + >>> orion_hparams = { + "model/hidden_channels": 4, + "model/num_layers": 4, + "optim/batch_size": 4, + "optim/initial_lr": 0.001, + "frame_averaging": "", + } + + >>> mult_factor_dict = {"value": 32, "targets": "hidden_channels, batch_size"} + + >>> apply_mult_factor(orion_hparams, mult_factor_dict, sep="/") + { + "model/hidden_channels": 128, + "model/num_layers": 4, + "optim/batch_size": 128, + "optim/initial_lr": 0.001, + "frame_averaging": "" + } + + Args: + orion_hparams (_type_): _description_ + mult_factor_dict (_type_): _description_ + sep (str, optional): _description_. Defaults to ".". + + Returns: + _type_: _description_ + """ + if not mult_factor_dict: + return orion_hparams + if not isinstance(mult_factor_dict, dict): + print( + f">>> Warning: ignoring apply_mult_factor, not a dict: {mult_factor_dict}." + ) + if "value" not in mult_factor_dict or "targets" not in mult_factor_dict: + print( + ">>> Warning: ignoring apply_mult_factor, " + + " missing 'value' or 'targets' keys: {}.".format(mult_factor_dict) + ) + value, targets = mult_factor_dict["value"], mult_factor_dict["targets"] + targets = set([t.strip() for t in targets.split(",")]) + updated_hparams = copy.deepcopy(orion_hparams) + for k, v in orion_hparams.items(): + target = k.split(sep)[-1] + if target in targets: + updated_hparams[k] = v * value + return updated_hparams + + +def set_max_fidelity(hparams, orion_exp): + for p, prior in orion_exp.space.items(): + if prior.type == "fidelity": + keys = p.split("/") + if len(keys) == 1: + hparams[f"fidelity_{p}"] = prior.high + elif len(keys) == 2: + if keys[0] not in hparams: + hparams[keys[0]] = {} + hparams[keys[0]][f"fidelity_{keys[1]}"] = prior.high + else: + print("Error: fidelity parameters must be at most 2 levels deep.") + return hparams + + +def sample_orion_hparams(orion_exp, trainer_config): + hparams = {} + try: + orion_trial = orion_exp.suggest(1) + print( + "\n🚨 Orion reservation race condition detected. 
Exiting", + "and deleting run dir", + ) + hparams = set_max_fidelity( + unflatten_dict( + apply_mult_factor( + orion_trial.params, + trainer_config.get("orion_mult_factor"), + sep="/", + ), + sep="/", + ), + orion_exp, + ) + hparams["orion_hash_params"] = orion_trial.hash_params + hparams["orion_unique_exp_name"] = orion_exp.name + except ReservationRaceCondition: + hparams["orion_race_condition"] = True + import wandb + + if wandb.run is not None: + if wandb.run.tags: + wandb.run.tags = wandb.run.tags + ("RaceCondition",) + else: + wandb.run.tags = ("RaceCondition",) + return hparams + + +def get_and_move_orion_db_path(exp_name): + db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) + db_file = f"{db_id}_db.pkl" if not db_id.endswith("_db.pkl") else db_id + scratch_db = RUN_DIR.parent / "orion" / "storage" / db_file + scratch_db.parent.mkdir(parents=True, exist_ok=True) + if not scratch_db.exists(): + home_db = ROOT / f"data/orion/storage/{db_file}" + + if not home_db.exists(): + return scratch_db + + lock_file = home_db.parent / f"{db_file}.cp_lock" + if not lock_file.exists(): + lock_file.touch() + copyfile(home_db, scratch_db) + move(home_db, home_db.parent / f"{db_file}.bak") + os.symlink(str(scratch_db), str(home_db)) + print("Copied and symlinked db from home to scratch.") + lock_file.unlink() + + while lock_file.exists(): + print("Waiting for lock to be released...") + time.sleep(1) + + return scratch_db + + +def load_orion_exp(args): + exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) + + assert args.orion_unique_exp_name or exp_config.get( + "unique_exp_name" + ), "Must provide orion_unique_exp_name in the command-line or the config file." + + print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") + exp_name = args.orion_unique_exp_name or exp_config["unique_exp_name"] + db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) + db_path = get_and_move_orion_db_path(db_id) + experiment = build_experiment( + storage={ + "database": { + "host": str(db_path), + "type": "pickleddb", + } + }, + name=exp_name, + space=exp_config["space"], + algorithms=exp_config["algorithms"], + ) + return experiment + + +def continue_orion_exp(trainer_config): + if not trainer_config.get("orion_exp_config_path"): + return trainer_config + + if "orion_hash_params" not in trainer_config: + faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" + print( + "\n\nWARNING: trainer_config has 'orion_exp_config_path'", + "but no 'orion_hash_params'.", + "This can lead to inconsistencies.", + f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", + ) + faulty_path.write_text(yaml.dump(trainer_config)) + return trainer_config + + hash_params = trainer_config["orion_hash_params"] + exp_name = trainer_config["orion_unique_exp_name"] + id_file = f"{exp_name}--{hash_params}.unique" + (Path(trainer_config["run_dir"]) / id_file).touch() + base_dir = Path(trainer_config["run_dir"]).parent + existing_id_files = list(base_dir.glob(f"*/{id_file}")) + + latest_dirs = sorted( + [ + f.parent + for f in existing_id_files + if float(f.parent.name) != float(trainer_config["job_id"]) + ], + key=lambda f: float(f.name), + ) + + if not latest_dirs: + print("\n😅 No previous Orion trial matched for unique file: ", id_file) + return trainer_config + + resume_dir = latest_dirs[-1] + + resume_ckpts = sorted( + [f for f in (resume_dir / "checkpoints").glob("checkpoint-*")], + key=lambda f: float(f.stem.split("-")[-1]), + ) + + if not 
resume_ckpts: + print(f"🥶 Warning: No checkpoint found in {str(resume_dir)}. Not resuming.") + return trainer_config + + trainer_config["checkpoint"] = str(resume_ckpts[-1]) + resume_url = (resume_dir / "wandb_url.txt").read_text().strip() + trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] + + print( + f"\n🎁 Found {len(resume_ckpts)} existing Orion runs.", + "Resuming from latest:", + str(resume_dir), + "\nOn wandb run:", + resume_url, + ) + print("Based on unique file id:", id_file) + print("Continuing from checkpoint:", trainer_config["checkpoint"], end="\n\n") + return trainer_config diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 2d79d56f06..75ca062e74 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -24,7 +24,6 @@ from functools import wraps from itertools import product from pathlib import Path -from shutil import copyfile, move import numpy as np import torch @@ -32,7 +31,6 @@ import yaml from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas from matplotlib.figure import Figure -from orion.client import build_experiment from torch_geometric.data import Data from torch_geometric.utils import remove_self_loops from torch_scatter import segment_coo, segment_csr @@ -70,189 +68,6 @@ def __getattr__(self, k: str): RUN_DIR = Path(os.environ["SCRATCH"]) / "ocp" / "runs" -def set_max_fidelity(hparams, orion_exp): - for p, prior in orion_exp.space.items(): - if prior.type == "fidelity": - keys = p.split("/") - if len(keys) == 1: - hparams[f"fidelity_{p}"] = prior.high - elif len(keys) == 2: - if keys[0] not in hparams: - hparams[keys[0]] = {} - hparams[keys[0]][f"fidelity_{keys[1]}"] = prior.high - else: - print("Error: fidelity parameters must be at most 2 levels deep.") - return hparams - - -def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): - """ - Multiplies all values of orion_hparams listed in mult_factor_dict["targets"] - by mult_factor_dict["value"]. - - eg: - >>> orion_hparams = { - "model/hidden_channels": 4, - "model/num_layers": 4, - "optim/batch_size": 4, - "optim/initial_lr": 0.001, - "frame_averaging": "", - } - - >>> mult_factor_dict = {"value": 32, "targets": "hidden_channels, batch_size"} - - >>> apply_mult_factor(orion_hparams, mult_factor_dict, sep="/") - { - "model/hidden_channels": 128, - "model/num_layers": 4, - "optim/batch_size": 128, - "optim/initial_lr": 0.001, - "frame_averaging": "" - } - - Args: - orion_hparams (_type_): _description_ - mult_factor_dict (_type_): _description_ - sep (str, optional): _description_. Defaults to ".". - - Returns: - _type_: _description_ - """ - if not mult_factor_dict: - return orion_hparams - if not isinstance(mult_factor_dict, dict): - print( - f">>> Warning: ignoring apply_mult_factor, not a dict: {mult_factor_dict}." 
- ) - if "value" not in mult_factor_dict or "targets" not in mult_factor_dict: - print( - ">>> Warning: ignoring apply_mult_factor, " - + " missing 'value' or 'targets' keys: {}.".format(mult_factor_dict) - ) - value, targets = mult_factor_dict["value"], mult_factor_dict["targets"] - targets = set([t.strip() for t in targets.split(",")]) - updated_hparams = copy.deepcopy(orion_hparams) - for k, v in orion_hparams.items(): - target = k.split(sep)[-1] - if target in targets: - updated_hparams[k] = v * value - return updated_hparams - - -def get_and_move_orion_db_path(exp_name): - db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) - db_file = f"{db_id}_db.pkl" if not db_id.endswith("_db.pkl") else db_id - scratch_db = RUN_DIR.parent / "orion" / "storage" / db_file - scratch_db.parent.mkdir(parents=True, exist_ok=True) - if not scratch_db.exists(): - home_db = ROOT / f"data/orion/storage/{db_file}" - - if not home_db.exists(): - return scratch_db - - lock_file = home_db.parent / f"{db_file}.cp_lock" - if not lock_file.exists(): - lock_file.touch() - copyfile(home_db, scratch_db) - move(home_db, home_db.parent / f"{db_file}.bak") - os.symlink(str(scratch_db), str(home_db)) - print("Copied and symlinked db from home to scratch.") - lock_file.unlink() - - while lock_file.exists(): - print("Waiting for lock to be released...") - time.sleep(1) - - return scratch_db - - -def load_orion_exp(args): - exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) - - assert args.orion_unique_exp_name or exp_config.get( - "unique_exp_name" - ), "Must provide orion_unique_exp_name in the command-line or the config file." - - print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") - exp_name = args.orion_unique_exp_name or exp_config["unique_exp_name"] - db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) - db_path = get_and_move_orion_db_path(db_id) - experiment = build_experiment( - storage={ - "database": { - "host": str(db_path), - "type": "pickleddb", - } - }, - name=exp_name, - space=exp_config["space"], - algorithms=exp_config["algorithms"], - ) - return experiment - - -def continue_orion_exp(trainer_config): - if not trainer_config.get("orion_exp_config_path"): - return trainer_config - - if "orion_hash_params" not in trainer_config: - faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" - print( - "\n\nWARNING: trainer_config has 'orion_exp_config_path'", - "but no 'orion_hash_params'.", - "This can lead to inconsistencies.", - f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", - ) - faulty_path.write_text(yaml.dump(trainer_config)) - return trainer_config - - hash_params = trainer_config["orion_hash_params"] - exp_name = trainer_config["orion_unique_exp_name"] - id_file = f"{exp_name}--{hash_params}.unique" - (Path(trainer_config["run_dir"]) / id_file).touch() - base_dir = Path(trainer_config["run_dir"]).parent - existing_id_files = list(base_dir.glob(f"*/{id_file}")) - - latest_dirs = sorted( - [ - f.parent - for f in existing_id_files - if float(f.parent.name) != float(trainer_config["job_id"]) - ], - key=lambda f: float(f.name), - ) - - if not latest_dirs: - print("\n😅 No previous Orion trial matched for unique file: ", id_file) - return trainer_config - - resume_dir = latest_dirs[-1] - - resume_ckpts = sorted( - [f for f in (resume_dir / "checkpoints").glob("checkpoint-*")], - key=lambda f: float(f.stem.split("-")[-1]), - ) - - if not resume_ckpts: - print(f"🥶 Warning: No checkpoint found in 
{str(resume_dir)}. Not resuming.") - return trainer_config - - trainer_config["checkpoint"] = str(resume_ckpts[-1]) - resume_url = (resume_dir / "wandb_url.txt").read_text().strip() - trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] - - print( - f"\n🎁 Found {len(resume_ckpts)} existing Orion runs.", - "Resuming from latest:", - str(resume_dir), - "\nOn wandb run:", - resume_url, - ) - print("Based on unique file id:", id_file) - print("Continuing from checkpoint:", trainer_config["checkpoint"], end="\n\n") - return trainer_config - - def read_slurm_env(config): """ Parses the output of `scontrol show` in order to store the slurm @@ -1089,6 +904,29 @@ def check_regress_forces(config): ) +def set_hidden_channels(config): + # Embedding( + # 85, + # hidden_channels + # - tag_hidden_channels + # - phys_hidden_channels + # - 2 * pg_hidden_channels, + # ) + hc = config["model"].get("hidden_channels", 0) + thc = config["model"].get("tag_hidden_channels", 0) + phc = config["model"].get("phys_hidden_channels", 0) * int( + config["model"].get("phys_embeds", 0) + ) + pghc = config["model"].get("pg_hidden_channels", 0) + + if hc - thc - phc - 2 * pghc < 0: + hc = thc + phc + 2 * pghc + 32 + print(f"WARNING: hidden_channels is too small. Setting it to {hc}") + config["model"]["hidden_channels"] = hc + + return config + + def load_config(config_str): model, task, split = config_str.split("-") conf_path = ROOT / "configs" / "models" @@ -1194,6 +1032,9 @@ def build_config(args, args_override): config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) config = override_drac_paths(config) + config = continue_from_slurm_job_id(config) + config = read_slurm_env(config) + config["optim"]["eval_batch_size"] = config["optim"]["batch_size"] return config diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index bbc647ee65..9963a299cb 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -145,7 +145,7 @@ def __init__(self, **kwargs): self.hpo_checkpoint_every = self.config["optim"].get("checkpoint_every", -1) if dist_utils.is_master() and not self.silent: - print("🧰 Trainer config:") + print(f"\n🧰 Trainer config:\n{'-'*17}\n") print(yaml.dump(self.config), end="\n\n") self.load() diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 74c8217681..bdb5a8908f 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -47,8 +47,6 @@ def now(self): return str(datetime.datetime.now()).split(".")[0] def load_task(self): - if not self.silent: - print(f"Loading dataset: {self.config['task']['dataset']}") self.num_targets = 1 # start imports from @@ -220,7 +218,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): model_run_time = 0 if not self.silent: - print(f"\n--- 🔄 Beginning of Training @ {self.now}---\n") + print(f"\n--- 🔄 Beginning of Training @ {self.now} ---\n") print(f"\nLogging train metrics every {log_train_every} steps") print(f"Printing train metrics every {self.config['print_every']} steps") print(f"\nEvaluating every {eval_every} steps\n") From 7aa4fb6a53fc364c262e9f7273ec23fba07cd276 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 17:28:41 -0500 Subject: [PATCH 193/273] fix prints --- main.py | 4 ++-- ocpmodels/trainers/base_trainer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 56993e2624..467b001694 100644 --- a/main.py +++ b/main.py @@ -46,12 
+46,12 @@ def print_warnings(): "`tag_specific_weights` is not handled for " + "`regress_forces: direct_with_gradient_target` in compute_loss()", ] - print("\n" + "-" * 80) + print("\n" + "-" * 80 + "\n") print("🛑 OCP-DR-Lab Warnings (nota benes):") for warning in warnings: print(f" • {warning}") print("Remove warnings when they are fixed in the code/configs.") - print("-" * 80 + "\n") + print("\n" + "-" * 80 + "\n") def wrap_up(args, start_time, trainer=None, error=None, signal=None): diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 9963a299cb..10c252ed08 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -145,7 +145,7 @@ def __init__(self, **kwargs): self.hpo_checkpoint_every = self.config["optim"].get("checkpoint_every", -1) if dist_utils.is_master() and not self.silent: - print(f"\n🧰 Trainer config:\n{'-'*17}\n") + print(f"\n🧰 Trainer config:\n{'-'*18}\n") print(yaml.dump(self.config), end="\n\n") self.load() From 14ea1f3858291c92cb8ccdaf014bf9d5c4976815 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 17:34:18 -0500 Subject: [PATCH 194/273] add `keep_orion_config` flag --- main.py | 2 ++ ocpmodels/common/flags.py | 6 ++++++ ocpmodels/common/utils.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/main.py b/main.py index 467b001694..4f2088f6e4 100644 --- a/main.py +++ b/main.py @@ -90,6 +90,8 @@ def wrap_up(args, start_time, trainer=None, error=None, signal=None): if args.logdir: args.logdir = resolve(args.logdir) + # -- Build config + trainer_config = build_config(args, override_args) original_trainer_config = copy.deepcopy(trainer_config) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 2a8caa3e91..6b35f7711a 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -95,6 +95,12 @@ def add_core_args(self): help="Run to restart, loading its config and overwriting " + "from the command-line", ) + self.parser.add_argument( + "--keep_orion_config", + type=bool, + help="If not True, any key in the continued/restarted config that contains" + + " ``orion`` will be set to ``None``", + ) self.parser.add_argument( "--timestamp-id", default=None, diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 75ca062e74..1d813441a5 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -990,6 +990,10 @@ def build_config(args, args_override): if args.continue_from_dir: continue_config["checkpoint"] = str(latest_ckpt) continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] + if not args.keep_orion_config: + for k in continue_config: + if "orion" in k: + continue_config[k] = None print( f"✅ Loading config from directory {str(cont_dir)}" + ( From 676929b449aa3abf0d61b3dd8f999bbf18da1214 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 17:42:47 -0500 Subject: [PATCH 195/273] fix normalizers for hof --- configs/exps/qm7x/schnet-fanet-lse.yaml | 9 --------- ocpmodels/common/utils.py | 2 +- ocpmodels/modules/normalizer.py | 20 +++++++++++--------- ocpmodels/trainers/base_trainer.py | 4 ++-- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml index 97097a6c16..b101084196 100644 --- a/configs/exps/qm7x/schnet-fanet-lse.yaml +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -63,22 +63,13 @@ default: runs: - config: schnet-qm7x-all - dataset: - train: - normalize_labels: True - config: schnet-qm7x-all 
- dataset: - train: - normalize_labels: True optim: lr_initial: 0.001 batch_size: 100 - config: schnet-qm7x-all - dataset: - train: - normalize_labels: True optim: lr_initial: 0.001 batch_size: 256 diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 1d813441a5..f3ea8c23ad 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -380,7 +380,7 @@ def set_qm7x_target_stats(trainer_config): if "train" in trainer_config["dataset"] and trainer_config["dataset"]["train"].get( "rescale_with_hof" ): - trainer_config["dataset"]["train"]["hof_rescales"] = hof_stats + trainer_config["dataset"]["train"]["hof_stats"] = hof_stats return trainer_config diff --git a/ocpmodels/modules/normalizer.py b/ocpmodels/modules/normalizer.py index df2830e276..f70c6dcf17 100644 --- a/ocpmodels/modules/normalizer.py +++ b/ocpmodels/modules/normalizer.py @@ -32,6 +32,7 @@ def __init__(self, tensor=None, mean=None, std=None, device=None): self.hof_mean = None self.hof_std = None + self.rescale_with_hof = False def to(self, device): self.mean = self.mean.to(device) @@ -43,19 +44,19 @@ def to(self, device): self.device = device def norm(self, tensor, hofs=None): - if hofs is not None: + if hofs is not None and self.rescale_with_hof: return tensor / hofs - self.hof_mean return (tensor - self.mean) / self.std def denorm(self, normed_tensor, hofs=None): - if hofs is not None: + if hofs is not None and self.rescale_with_hof: return (normed_tensor + self.hof_mean) * hofs return normed_tensor * self.std + self.mean def state_dict(self): sd = {"mean": self.mean, "std": self.std} - if self.hof_mean is not None: - sd["hof_rescales"] = { + if self.rescale_with_hof: + sd["hof_stats"] = { "mean": self.hof_mean, "std": self.hof_std, } @@ -64,9 +65,10 @@ def state_dict(self): def load_state_dict(self, state_dict): self.mean = state_dict["mean"].to(self.mean.device) self.std = state_dict["std"].to(self.mean.device) - if "hof_rescales" in state_dict: - self.set_hof_rescales(state_dict["hof_rescales"]) + if "hof_stats" in state_dict: + self.set_hof_rescales(state_dict["hof_stats"]) - def set_hof_rescales(self, hof_rescales): - self.hof_mean = torch.tensor(hof_rescales["mean"], device=self.device) - self.hof_std = torch.tensor(hof_rescales["std"], device=self.device) + def set_hof_rescales(self, hof_stats): + self.hof_mean = torch.tensor(hof_stats["mean"], device=self.device) + self.hof_std = torch.tensor(hof_stats["std"], device=self.device) + self.rescale_with_hof = True diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 10c252ed08..901908345a 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -319,9 +319,9 @@ def load_datasets(self): std=self.normalizer["target_std"], device=self.device, ) - if "hof_rescales" in self.normalizer: + if "hof_stats" in self.normalizer: self.normalizers["target"].set_hof_rescales( - self.normalizer["hof_rescales"] + self.normalizer["hof_stats"] ) else: self.normalizers["target"] = Normalizer( From 6798ecded51f62e5fe3b1890cfca518afa378c2e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:32:12 -0500 Subject: [PATCH 196/273] + set_hidden_channels --- main.py | 4 ++-- ocpmodels/common/utils.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 4f2088f6e4..1290ca79f4 100644 --- a/main.py +++ b/main.py @@ -54,7 +54,7 @@ def print_warnings(): print("\n" + "-" * 80 + "\n") -def wrap_up(args, start_time, trainer=None, error=None, 
signal=None): +def wrap_up(args, start_time, error=None, signal=None, trainer=None): total_time = time.time() - start_time logging.info(f"Total time taken: {total_time}") @@ -78,7 +78,7 @@ def wrap_up(args, start_time, trainer=None, error=None, signal=None): if __name__ == "__main__": - error = signal = orion_exp = orion_trial = None + error = signal = orion_exp = orion_trial = trainer = None orion_race_condition = False hparams = {} diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index f3ea8c23ad..b344c639ff 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -914,9 +914,8 @@ def set_hidden_channels(config): # ) hc = config["model"].get("hidden_channels", 0) thc = config["model"].get("tag_hidden_channels", 0) - phc = config["model"].get("phys_hidden_channels", 0) * int( - config["model"].get("phys_embeds", 0) - ) + phc = config["model"].get("phys_hidden_channels", 0) or 14 + phc *= int(config["model"].get("phys_embeds", 0)) pghc = config["model"].get("pg_hidden_channels", 0) if hc - thc - phc - 2 * pghc < 0: From 0f276bad75f22bdbec5f22d718689af62ebf786e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:32:22 -0500 Subject: [PATCH 197/273] qm9 orion v5 --- configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml new file mode 100644 index 0000000000..b52a2003aa --- /dev/null +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml @@ -0,0 +1,73 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 8GB + cpus: 4 + gres: gpu:1 + time: 02:50:00 + partition: long + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion + log_train_every: 200 + optim: + batch_size: 32 + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + loss_energy: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 800 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: num_filters, pg_hidden_channels, num_gaussians + frame_averaging: 3D + fa_frames: random + model: + edge_embed_type: all_rij + energy_head: "" + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 20 + + unique_exp_name: fanet-qm9-v5.0.0 + + space: + optim/max_epochs: fidelity(200, 2000, base=5) + optim/lr_initial: loguniform(1e-4, 6e-4, precision=3) + model/graph_norm: choices([True, False]) + model/skip_co: choices([True, False]) + model/second_layer_mlp: choices([True, False]) + model/hidden_channels: uniform(6, 15, discrete=True) + model/mp_type: choices(["simple", "base", "updownscale", "updownscale_base"]) + model/num_filters: uniform(4, 16, discrete=True) + model/num_gaussians: uniform(1, 4, discrete=True) + model/num_interactions: uniform(3, 5, discrete=True) + model/pg_hidden_channels: uniform(0, 1, discrete=True) + model/phys_embeds: choices([True, 
False]) + model/max_num_neighbours: choices([30, 40, 50]) + model/cutoff: uniform(4, 6, precision=1) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From b83da002f17d3bd21eae6098063665019f671a8d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:34:59 -0500 Subject: [PATCH 198/273] add = in sbatch rundir and logdir --- sbatch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sbatch.py b/sbatch.py index 0d6affdfbe..bf17bd5c6f 100644 --- a/sbatch.py +++ b/sbatch.py @@ -260,10 +260,10 @@ def write_orion_config(args, outdir): # add logdir to main.py's command-line arguments if "--logdir" not in args.py_args and args.logdir: - args.py_args += f" --logdir {args.logdir}" + args.py_args += f" --logdir={args.logdir}" # add run-dir to main.py's command-line arguments if "--run-dir" not in args.py_args and args.logdir: - args.py_args += f" --run-dir {args.logdir}" + args.py_args += f" --run-dir={args.logdir}" if "--note" not in args.py_args and args.note: note = args.note.replace('"', '\\"') From 4b7b9a2a21e508579b2cbba1c1f59adbc8e84a64 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:43:33 -0500 Subject: [PATCH 199/273] add job_name from exp_name for slurm --- launch_exp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/launch_exp.py b/launch_exp.py index 8ab6d86867..665cbba6bd 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -197,6 +197,7 @@ def get_args_or_exp(key, args, exp): runs = [ { "orion_exp_config_path": str(search_path), + "job_name": unique_exp_name, } for _ in range(n_jobs) ] From 8e8b3729a88378f498db2e9ef66bb6bf74a0ffe4 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:45:39 -0500 Subject: [PATCH 200/273] auto job_name from exp_name if no orion_unique_exp_name --- launch_exp.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index 665cbba6bd..dd7d4110ad 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -197,7 +197,9 @@ def get_args_or_exp(key, args, exp): runs = [ { "orion_exp_config_path": str(search_path), - "job_name": unique_exp_name, + "job": { + "job_name": unique_exp_name, + }, } for _ in range(n_jobs) ] @@ -215,6 +217,9 @@ def get_args_or_exp(key, args, exp): if "time" in job: job["time"] = seconds_to_time_str(job["time"]) + if "job_name" not in job: + job["job_name"] = exp_name + if "wandb_tags" in params: params["wandb_tags"] += "," + exp_name else: From 55b738a4be85c697351665ef99565142de1b7ee6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 20:20:50 -0500 Subject: [PATCH 201/273] shift None observation print --- main.py | 6 +++--- ocpmodels/common/orion_utils.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 1290ca79f4..c7c5f30d45 100644 --- a/main.py +++ b/main.py @@ -122,7 +122,7 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): if dist_utils.is_master(): if orion_exp: - hparams = sample_orion_hparams(orion_exp, trainer_config) + hparams, orion_trial = sample_orion_hparams(orion_exp, trainer_config) if hparams.get("orion_race_condition"): logging.warning("\n\n ⛔️ Orion race condition. Stopping here.\n\n") wrap_up(args, start_time, error, signal) @@ -182,13 +182,13 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): "Received trainer_init_error from worker.", "Setting objective to 1e12.", ) - else: - print("Received None objective from worker. 
Skipping observation.") if objective is not None: orion_exp.observe( orion_trial, [{"type": "objective", "name": "energy_mae", "value": objective}], ) + else: + print("Received None objective from worker. Skipping observation.") except Exception: error = True diff --git a/ocpmodels/common/orion_utils.py b/ocpmodels/common/orion_utils.py index 7f44cb5683..24a2f0a52e 100644 --- a/ocpmodels/common/orion_utils.py +++ b/ocpmodels/common/orion_utils.py @@ -82,6 +82,7 @@ def set_max_fidelity(hparams, orion_exp): def sample_orion_hparams(orion_exp, trainer_config): hparams = {} + orion_trial = None try: orion_trial = orion_exp.suggest(1) print( @@ -110,7 +111,7 @@ def sample_orion_hparams(orion_exp, trainer_config): wandb.run.tags = wandb.run.tags + ("RaceCondition",) else: wandb.run.tags = ("RaceCondition",) - return hparams + return hparams, orion_trial def get_and_move_orion_db_path(exp_name): From 2f64dbdbc458cd45f507b51346fcebbd0bef9bc7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 20:43:08 -0500 Subject: [PATCH 202/273] override max_epochs or steps from cli --- main.py | 17 +++++++---------- ocpmodels/common/flags.py | 2 +- ocpmodels/common/utils.py | 38 ++++++++++++++++++++++++++++++++------ 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/main.py b/main.py index c7c5f30d45..01fc6c3c6d 100644 --- a/main.py +++ b/main.py @@ -27,7 +27,7 @@ setup_imports, setup_logging, update_from_sbatch_py_vars, - set_hidden_channels, + set_min_hidden_channels, ) from ocpmodels.common.orion_utils import ( continue_orion_exp, @@ -93,7 +93,6 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): # -- Build config trainer_config = build_config(args, override_args) - original_trainer_config = copy.deepcopy(trainer_config) if args.distributed: dist_utils.setup(trainer_config) @@ -119,14 +118,12 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): if args.orion_exp_config_path and dist_utils.is_master(): orion_exp = load_orion_exp(args) + hparams, orion_trial = sample_orion_hparams(orion_exp, trainer_config) - if dist_utils.is_master(): - if orion_exp: - hparams, orion_trial = sample_orion_hparams(orion_exp, trainer_config) - if hparams.get("orion_race_condition"): - logging.warning("\n\n ⛔️ Orion race condition. Stopping here.\n\n") - wrap_up(args, start_time, error, signal) - sys.exit() + if hparams.get("orion_race_condition"): + logging.warning("\n\n ⛔️ Orion race condition. 
Stopping here.\n\n") + wrap_up(args, start_time, error, signal) + sys.exit() hparams = dist_utils.broadcast_from_master(hparams) if hparams: @@ -138,7 +135,7 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): trainer_config = continue_orion_exp(trainer_config) trainer_config = auto_note(trainer_config) - trainer_config = set_hidden_channels(trainer_config) + trainer_config = set_min_hidden_channels(trainer_config) try: cls = registry.get_trainer_class(trainer_config["trainer"]) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 6b35f7711a..1dd7e83701 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -99,7 +99,7 @@ def add_core_args(self): "--keep_orion_config", type=bool, help="If not True, any key in the continued/restarted config that contains" - + " ``orion`` will be set to ``None``", + + " ``orion`` or ``fidelity`` will be set to ``None``", ) self.parser.add_argument( "--timestamp-id", diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index b344c639ff..8b3ca97fa5 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -904,7 +904,7 @@ def check_regress_forces(config): ) -def set_hidden_channels(config): +def set_min_hidden_channels(config): # Embedding( # 85, # hidden_channels @@ -990,9 +990,15 @@ def build_config(args, args_override): continue_config["checkpoint"] = str(latest_ckpt) continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if not args.keep_orion_config: + dels = {} for k in continue_config: - if "orion" in k: + if "orion" in k or "fidelity" in k: + dels[k] = copy.deepcopy(continue_config[k]) continue_config[k] = None + print( + "Removing orion config from continue config. Set to None:", + "{" + ", ".join([f"{k}: {v}->None" for k, v in dels.items()]) + "}", + ) print( f"✅ Loading config from directory {str(cont_dir)}" + ( @@ -1021,14 +1027,34 @@ def build_config(args, args_override): if continue_config: new_dirs = [(k, v) for k, v in config.items() if "dir" in k] - # dataset_config = copy.deepcopy(config["dataset"]) config = merge_dicts( continue_config, {k: resolve(v) if isinstance(v, str) else v for k, v in new_dirs}, ) - # config["dataset"] = dataset_config - config = merge_dicts(config, cli_args_dict()) - config = merge_dicts(config, overrides) + cli = cli_args_dict() + if "max_steps" in cli.get("optim", {}): + if "max_epochs" in cli.get("optim", {}): + print( + "Cannot set both `max_steps` and `max_epochs` from CLI.", + " Using `max_steps`.", + ) + del cli["optim"]["max_epochs"] + if "max_epochs" in config["optim"]: + print( + f"Deleting max_epochs ({config['optim']['max_epochs']})", + " because of `max_steps` from CLI.", + "It will be reset by the Trainer.", + ) + del config["optim"]["max_epochs"] + elif "max_epochs" in cli.get("optim", {}): + if "max_steps" in config["optim"]: + print( + f"Deleting max_steps ({config['optim']['max_steps']})", + " because of `max_epochs` from CLI.", + "It will be reset by the Trainer.", + ) + del config["optim"]["max_steps"] + config = merge_dicts(config, cli) check_regress_forces(config) config = set_cpus_to_workers(config) From 15e1d5f1f1d84765ff1b9d0452a45795b303bd65 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 20:49:20 -0500 Subject: [PATCH 203/273] add no confirm flag --- launch_exp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index dd7d4110ad..7fa87240b8 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -243,7 +243,7 @@ def 
get_args_or_exp(key, args, exp): text += "\n<><><> Experiment config:\n\n-----" + exp_file.read_text() + "-----" text += "\n<><><> Experiment runs:\n\n • " + "\n\n • ".join(commands) + separator - confirm = input("\n🚦 Confirm? [y/n] : ") + confirm = args.no_confirm or input("\n🚦 Confirm? [y/n] : ") if confirm == "y": try: From 08d5e5e61b419945173ae09ca2b3854266f6dcef Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 10:21:06 -0500 Subject: [PATCH 204/273] fix qm coefs --- configs/exps/qm7x/schnet-fanet-lse.yaml | 55 +++++++++++++++++++------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml index b101084196..8b6000ea9b 100644 --- a/configs/exps/qm7x/schnet-fanet-lse.yaml +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -29,12 +29,12 @@ default: max_epochs: 100 warmup_steps: 3000 lr_initial: 0.0001 - eval_every: 0.34 + eval_every: 0.251 + energy_coefficient: 0.01 + energy_grad_coefficient: 0 + force_coefficient: 0.99 # parameters EMA ema_decay: 0.999 - energy_coefficient: 0. - energy_grad_coefficient: 0 - force_coefficient: 1. loss_energy: mae loss_force: mse # all below is for the scheduler @@ -66,14 +66,8 @@ runs: - config: schnet-qm7x-all optim: - lr_initial: 0.001 batch_size: 100 - - config: schnet-qm7x-all - optim: - lr_initial: 0.001 - batch_size: 256 - - config: fanet-qm7x-all model: graph_norm: true @@ -82,9 +76,13 @@ runs: - config: fanet-qm7x-all optim: - lr_initial: 0.001 batch_size: 100 + energy_coefficient: 0.01 + energy_grad_coefficient: 0.1 + force_coefficient: 0.89 model: + hidden_channels: 256 + num_filters: 256 graph_norm: true edge_embed_type: all_rij mp_type: updownscale_base @@ -93,8 +91,40 @@ runs: - config: fanet-qm7x-all optim: - lr_initial: 0.001 batch_size: 100 + energy_coefficient: 0.01 + energy_grad_coefficient: 0 + force_coefficient: 0.99 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct + mp_type: updownscale_base + num_interactions: 4 + regress_forces: direct + + - config: fanet-qm7x-all + optim: + batch_size: 100 + energy_coefficient: 0.01 + energy_grad_coefficient: 0 + force_coefficient: 0.99 + model: + hidden_channels: 256 + num_filters: 256 + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: from_energy + + - config: fanet-qm7x-all + optim: + batch_size: 100 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 0 model: graph_norm: false force_decoder_type: mlp @@ -102,3 +132,4 @@ runs: regress_forces: direct mp_type: updownscale_base num_interactions: 4 + regress_forces: "" From 1733dc3860545cd7fc9a5937121f48a27d63a81f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:26:27 -0500 Subject: [PATCH 205/273] qm9 lse --- .../qm9-metadata/lse-shifts-pre-attr.json | 230 ++++++++++++++++++ configs/models/tasks/qm9.yaml | 3 + ocpmodels/datasets/qm9.py | 23 +- scripts/compute_qm9_lse.py | 44 ++++ 4 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 configs/models/qm9-metadata/lse-shifts-pre-attr.json create mode 100644 scripts/compute_qm9_lse.py diff --git a/configs/models/qm9-metadata/lse-shifts-pre-attr.json b/configs/models/qm9-metadata/lse-shifts-pre-attr.json new file mode 100644 index 0000000000..86f4af4829 --- /dev/null +++ b/configs/models/qm9-metadata/lse-shifts-pre-attr.json @@ -0,0 +1,230 @@ +[ + [ + 0.0, + -0.10982761652979106, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.32071860969393706, + 0.7727475754212988, + 0.6122788803796335, + 0.12181916029627653 + ], + [ + 0.0, + 1.0966529080621437, + 0.0, + 0.0, + 0.0, + 0.0, + 8.345193465915363, + 6.7224998615447875, + 3.732818145439648, + 2.2936289582074605 + ], + [ + 0.0, + 0.0077456863520091205, + 0.0, + 0.0, + 0.0, + 0.0, + -0.7250601459571974, + -0.6989512318127282, + -0.8863108915238633, + -0.9922779353470085 + ], + [ + 0.0, + 0.30021842422367884, + 0.0, + 0.0, + 0.0, + 0.0, + -0.26317262369533545, + -0.2875368455127921, + -0.3500397347350114, + 0.04404252090948248 + ], + [ + 0.0, + 0.2924707296899182, + 0.0, + 0.0, + 0.0, + 0.0, + 0.46189245571758497, + 0.4114099985995442, + 0.5362623401609063, + 1.036313993269014 + ], + [ + 0.0, + 47.45816316610576, + 0.0, + 0.0, + 0.0, + 0.0, + 70.0750144837597, + 124.4791335864648, + 127.86854197433897, + 138.72469086321337 + ], + [ + 0.0, + 0.3121385900756226, + 0.0, + 0.0, + 0.0, + 0.0, + 0.1369104053751863, + 0.1395331622284403, + 0.11435811860274497, + 0.0927858357374155 + ], + [ + 0.0, + -16.42979788627529, + 0.0, + 0.0, + 0.0, + 0.0, + -1036.0476484603748, + -1489.8018410118, + -2046.9839395287415, + -2717.500731519501 + ], + [ + 0.0, + -16.41971726970345, + 0.0, + 0.0, + 0.0, + 0.0, + -1036.0357739433853, + -1489.7785840232264, + -2046.957058538375, + -2717.4729766865266 + ], + [ + 0.0, + -16.41966403449024, + 0.0, + 0.0, + 0.0, + 0.0, + -1036.032953979038, + -1489.7756271261542, + -2046.9541183747576, + -2717.4699865136436 + ], + [ + 0.0, + -16.442885376380016, + 0.0, + 0.0, + 0.0, + 0.0, + -1036.1303574932683, + -1489.905244199318, + -2047.0931070097215, + -2717.611776049197 + ], + [ + 0.0, + 1.2395997052342684, + 0.0, + 0.0, + 0.0, + 0.0, + 2.0381439844712617, + 2.786144517849412, + 3.081733026508292, + 3.3493991981514926 + ], + [ + 0.0, + -2.816675503921841, + 0.0, + 0.0, + 0.0, + 0.0, + -6.184539526467787, + -4.499344570456945, + -4.372725898402976, + -4.015912118288816 + ], + [ + 0.0, + -2.8451268608519213, + 0.0, + 0.0, + 0.0, + 0.0, + -6.211222454315244, + -4.514619577859387, + -4.384377408978144, + -4.026689035403237 + ], + [ + 0.0, + -2.8707881825750228, + 0.0, + 0.0, + 0.0, + 0.0, + -6.234090510252649, + -4.537349782429693, + -4.40712339763125, + -4.049381712818243 + ], + [ + 0.0, + -2.539854050015558, + 0.0, + 0.0, + 0.0, + 0.0, + -5.871457077318331, + -4.193597910926504, + -4.075001860101352, + -3.723844087894856 + ], + [ + 0.0, + -12.07141210715022, + 0.0, + 0.0, + 0.0, + 0.0, + 21.450944711624825, + -6.003425192133964, + -7.458598998696279, + -12.120394582901548 + ], + [ + 0.0, + -0.03777062245051535, + 0.0, + 0.0, + 0.0, + 0.0, + 0.21317817854549387, + 0.1600228078641471, + 0.1495886265915201, + 0.1495129174010618 + ], + [ + 0.0, + -0.023858095381470663, + 0.0, + 0.0, + 0.0, + 0.0, + 0.16957282642751406, + 0.09736248987285331, + 0.10686749266146903, + 0.08566440464961594 + ] +] \ No newline at end of file diff --git a/configs/models/tasks/qm9.yaml b/configs/models/tasks/qm9.yaml index e53c071188..42b4256a1e 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -30,6 +30,7 @@ default: target: 12 # predict atomization energy at 0K at index 12 seed: 123 normalize_labels: True # mean and std of target will be set by qm9.py if this is True + lse_shift: false indices: start: 0 end: 110000 @@ -37,6 +38,7 @@ default: src: /network/projects/ocp/qm9 target: 12 # predict atomization energy at 0K at index 12 seed: 123 + lse_shift: false indices: start: 110000 end: 120000 @@ -44,6 +46,7 @@ default: src: /network/projects/ocp/qm9 
target: 12 # predict atomization energy at 0K at index 12 seed: 123 + lse_shift: false indices: start: 120000 end: -1 diff --git a/ocpmodels/datasets/qm9.py b/ocpmodels/datasets/qm9.py index 3a5930569b..c0882761a1 100644 --- a/ocpmodels/datasets/qm9.py +++ b/ocpmodels/datasets/qm9.py @@ -1,11 +1,12 @@ from pathlib import Path import time - +import json import torch from torch_geometric.datasets import QM9 from ocpmodels.common.registry import registry +from ocpmodels.common.utils import ROOT @registry.register_dataset("qm9") @@ -61,6 +62,20 @@ def __init__( else: self.samples = self.perm[start:end] + self.lse_shifts = None + if self.config.get("lse_shift"): + self.lse_shifts = torch.tensor( + json.loads( + ( + ROOT + / "configs" + / "models" + / "qm9-metadata" + / "lse-shifts-pre-attr.json" + ).read_text() + ) + ) + def close_db(self): pass @@ -76,6 +91,12 @@ def __getitem__(self, idx): data.cell_offsets = torch.zeros((data.edge_index.shape[1], 3)) del data.z data.tags = torch.full((data.natoms,), -1, dtype=torch.long) + + if self.lse_shifts is not None: + data.lse_shift = self.lse_shifts[self.target][data.atomic_numbers].sum() + data.y_unshifted = data.y + data.y = data.y - data.lse_shift + t1 = time.time_ns() if self._transform is not None: data = self._transform(data) diff --git a/scripts/compute_qm9_lse.py b/scripts/compute_qm9_lse.py new file mode 100644 index 0000000000..268f1c2caa --- /dev/null +++ b/scripts/compute_qm9_lse.py @@ -0,0 +1,44 @@ +import json +from pathlib import Path +from tqdm import tqdm +import numpy as np +from sklearn.feature_extraction import DictVectorizer +from torch_geometric.datasets import QM9 + + +def count_fn(y): + return dict(zip(*np.unique(y, return_counts=True))) + + +if __name__ == "__main__": + # from SO3Krates + # https://github.com/thorben-frank/mlff/blob/v0.1/mlff/src/data/preprocessing.py#L297 + base = Path("/network/projects/ocp/qm9") + ds = QM9(base) + + shifts_per_attr = [] + + for attr in tqdm(range(ds[0].y.shape[-1])): + + data = [(d.y[0, attr].numpy(), d.z) for d in ds] + q = np.array([d[0] for d in data]) + max_n_atoms = max([len(d[1]) for d in data]) + z = np.array([np.pad(d[1], (0, max_n_atoms - len(d[1]))) for d in data]) + u = np.unique(z) + idx_ = u != 0 # remove padding with 0 + lhs_counts = list(map(count_fn, z)) + v = DictVectorizer(sparse=False) + X = v.fit_transform(lhs_counts) + X = X[..., idx_] + + sol = np.linalg.lstsq(X, q, rcond=None) + shifts = np.zeros(np.max(u) + 1) + for k, v in dict(zip(u[idx_], sol[0])).items(): + shifts[k] = v + shifts_per_attr.append(shifts.tolist()) + + j_dir = ( + Path(__file__).resolve().parent.parent / "configs" / "models" / "qm9-metadata" + ) + j_dir.mkdir(parents=True, exist_ok=True) + (j_dir / "lse-shifts-pre-attr.json").write_text(json.dumps(shifts_per_attr)) From 296ee5c32864306338c7dbda35d9144cd248db83 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:26:48 -0500 Subject: [PATCH 206/273] merge val_ood/train --- configs/models/tasks/qm7x.yaml | 21 +++++++++++---------- ocpmodels/datasets/qm7x.py | 6 +++++- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 3408bf5b8f..10919390a2 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -38,6 +38,7 @@ default: std_divider: 1.0 rescale_with_hof: False lse_shift: True + include_val_ood: True val_id: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by 
utils.py if this is True @@ -46,23 +47,23 @@ default: forces_target: totFOR std_divider: 1.0 lse_shift: True - val_ood: + # val_ood: + # src: /network/projects/ocp/qm7x/processed + # normalize_labels: True # mean and std of target will be set by utils.py if this is True + # split: val_ood + # target: ePBE0+MBD + # forces_target: totFOR + # std_divider: 1.0 + # lse_shift: True + test: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by utils.py if this is True - split: val_ood + split: test target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 lse_shift: True - # TEST SET DO NOT ENABLE - # - src: /network/projects/ocp/qm9 - # target: 7 # predict internal energy at 0K at index 7 - # seed: 123 - # ratio: - # start: 0.85 - # end: 1.0 - # ^`target` is a string to select the target to predict as per # https://arxiv.org/abs/2006.15139 Table 2 diff --git a/ocpmodels/datasets/qm7x.py b/ocpmodels/datasets/qm7x.py index 97c3b85225..7ecb7e0bf3 100644 --- a/ocpmodels/datasets/qm7x.py +++ b/ocpmodels/datasets/qm7x.py @@ -754,9 +754,13 @@ def __init__( split in all_samples["splits"] ), f"split {split} not found in sample mapping" + sample_ids = all_samples["splits"][split] + if self.config.get("include_val_ood"): + sample_ids = sorted(sample_ids + all_samples["splits"]["val_ood"]) + self.keys = [ f'{all_samples["structures"][i][0]}-{all_samples["structures"][i][1]}' - for i in all_samples["splits"][split] + for i in sample_ids ] self.hofs = fetch_table("elements")["heat_of_formation"].values From 7f3041696b51a190ca6172636f7b21bf78f285f1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:27:00 -0500 Subject: [PATCH 207/273] check lse qm9 --- ocpmodels/common/utils.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 8b3ca97fa5..10afc56ce8 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -306,6 +306,13 @@ def set_qm9_target_stats(trainer_config): continue if not dataset.get("normalize_labels", False): continue + elif dataset.get("lse_shift"): + print( + "Setting normalize_labels to False because of lse_shift for split", + f"{d}.", + ) + trainer_config["dataset"][d]["normalize_labels"] = False + continue assert "target" in dataset mean = target_means[dataset["target"]] std = target_stds[dataset["target"]] @@ -354,14 +361,13 @@ def set_qm7x_target_stats(trainer_config): continue if not dataset.get("normalize_labels", False): continue - else: - if dataset.get("lse_shift"): - print( - "Setting normalize_labels to False because of lse_shift for split", - f"{d}.", - ) - trainer_config["dataset"][d]["normalize_labels"] = False - continue + elif dataset.get("lse_shift"): + print( + "Setting normalize_labels to False because of lse_shift for split", + f"{d}.", + ) + trainer_config["dataset"][d]["normalize_labels"] = False + continue assert "target" in dataset, "target must be specified." 
mean = target_stats[dataset["target"]]["mean"] From dd0816c07ad61675bfe5c0c19b50f85a68dab84f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:27:07 -0500 Subject: [PATCH 208/273] format --- configs/models/qm7x-metadata/lse-shifts.json | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/configs/models/qm7x-metadata/lse-shifts.json b/configs/models/qm7x-metadata/lse-shifts.json index 8893e2a5cc..7a002ce3dc 100644 --- a/configs/models/qm7x-metadata/lse-shifts.json +++ b/configs/models/qm7x-metadata/lse-shifts.json @@ -1 +1,20 @@ -[0.0, -16.48365429710017, 0.0, 0.0, 0.0, 0.0, -1035.230325647512, -1488.1741712581756, -2045.3532693858685, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -10832.70108036143, -12520.741665730922] \ No newline at end of file +[ + 0.0, + -16.48365429710017, + 0.0, + 0.0, + 0.0, + 0.0, + -1035.230325647512, + -1488.1741712581756, + -2045.3532693858685, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + -10832.70108036143, + -12520.741665730922 +] \ No newline at end of file From e10fd1470e2b6fbc34adf0dbd9365ac627e202a6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:27:48 -0500 Subject: [PATCH 209/273] store forces_grad_target even with direct regress_forces --- ocpmodels/models/base_model.py | 6 +++--- ocpmodels/trainers/single_trainer.py | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ocpmodels/models/base_model.py b/ocpmodels/models/base_model.py index cef787d8bd..59648e902b 100644 --- a/ocpmodels/models/base_model.py +++ b/ocpmodels/models/base_model.py @@ -61,9 +61,9 @@ def forward(self, data): elif self.regress_forces in {"direct", "direct_with_gradient_target"}: # predicted forces are the model's direct forces preds["forces"] = forces - if self.regress_forces == "direct_with_gradient_target": - # store the energy gradient as the target - preds["forces_grad_target"] = grad_forces.detach() + # store the energy gradient as the target. Used for metrics + # only in "direct" mode. + preds["forces_grad_target"] = grad_forces.detach() else: raise ValueError( f"Unknown forces regression mode {self.regress_forces}" diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index bdb5a8908f..36d11a1d73 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -541,16 +541,17 @@ def compute_loss(self, preds, batch_list): ) loss["total_loss"].append(force_mult * loss["force_loss"]) if "forces_grad_target" in preds: - energy_grad_mult = self.config["optim"].get( - "energy_grad_coefficient", 10 - ) grad_target = preds["forces_grad_target"] loss["energy_grad_loss"] = self.loss_fn["force"]( preds["forces"][mask], grad_target[mask] ) - loss["total_loss"].append( - energy_grad_mult * loss["energy_grad_loss"] - ) + if self.model.regress_forces == "direct_with_energy_grad": + energy_grad_mult = self.config["optim"].get( + "energy_grad_coefficient", 10 + ) + loss["total_loss"].append( + energy_grad_mult * loss["energy_grad_loss"] + ) # Sanity check to make sure the compute graph is correct. 
for lc in loss["total_loss"]: assert hasattr(lc, "grad_fn") From 52cd6d96d911134c20a915a2ab97e81985bca773 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:27:52 -0500 Subject: [PATCH 210/273] new exps --- configs/exps/icml/qm9/fanet-manual-lse.yaml | 115 +++++++++++++++++++ configs/exps/qm7x/schnet-fanet-lse.yaml | 121 ++++++++++++++------ 2 files changed, 203 insertions(+), 33 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-manual-lse.yaml diff --git a/configs/exps/icml/qm9/fanet-manual-lse.yaml b/configs/exps/icml/qm9/fanet-manual-lse.yaml new file mode 100644 index 0000000000..877dc62249 --- /dev/null +++ b/configs/exps/icml/qm9/fanet-manual-lse.yaml @@ -0,0 +1,115 @@ +# scheduler reduce lr on plateau +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-lse + log_train_every: 200 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size, lr_initial + frame_averaging: 3D + fa_frames: random + dataset: + train: + lse_shift: true + val: + lse_shift: true + test: + lse_shift: true + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + batch_size: 64 + initial_lr: 0.0005 + max_epochs: 1500 + loss_energy: mse + loss_force: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 500 + # all below is for the ReduceLROnPlateau scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + model: + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: True + hidden_channels: 160 + num_filters: 160 + max_num_neighbors: 30 + mp_type: updownscale_bae + num_gaussians: 50 + num_interactions: 4 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: true + phys_hidden_channels: 0 + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + use_pbc: false + regress_forces: "" + + +runs: + - {} + - model: + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: false + hidden_channels: 96 + max_num_neighbors: 30 + mp_type: updownscale_bae + num_filters: 224 + num_gaussians: 128 + num_interactions: 4 + phys_embeds: false + second_layer_MLP: false + skip_co: false + - model: + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: true + hidden_channels: 110 + max_num_neighbors: 40 + mp_type: updownscale_bae + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + phys_embeds: true + second_layer_MLP: true + skip_co: true + - model: + cutoff: 6.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: true + hidden_channels: 110 + max_num_neighbors: 30 + mp_type: updownscale + num_filters: 192 + num_gaussians: 128 + num_interactions: 5 + phys_embeds: true + second_layer_MLP: false + skip_co: true diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml index 8b6000ea9b..423440f996 100644 --- a/configs/exps/qm7x/schnet-fanet-lse.yaml +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -4,7 +4,7 @@ job: cpus: 4 gres: gpu:16gb:1 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 env: ocp-a100 default: @@ -22,17 +22,17 @@ default: cp_data_to_tmpdir: 
true note: task: name - model: name, num_gaussians, hidden_channels, num_filters, num_interactions - optim: batch_size, lr_initial + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient optim: batch_size: 10 - max_epochs: 100 + max_steps: 2000000 warmup_steps: 3000 lr_initial: 0.0001 eval_every: 0.251 - energy_coefficient: 0.01 + energy_coefficient: 1 energy_grad_coefficient: 0 - force_coefficient: 0.99 + force_coefficient: 100 # parameters EMA ema_decay: 0.999 loss_energy: mae @@ -63,26 +63,45 @@ default: runs: - config: schnet-qm7x-all - + - config: schnet-qm7x-all + model: + regress_forces: "" + - config: schnet-qm7x-all + optim: + energy_coefficient: 0 - config: schnet-qm7x-all optim: batch_size: 100 + max_steps: 1000000 + lr_initial: 0.0003 - config: fanet-qm7x-all + optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 model: + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 graph_norm: true edge_embed_type: all_rij mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct_with_gradient_target - config: fanet-qm7x-all optim: - batch_size: 100 - energy_coefficient: 0.01 - energy_grad_coefficient: 0.1 - force_coefficient: 0.89 + batch_size: 50 + max_steps: 1000000 + initial_lr: 0.0005 + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 model: - hidden_channels: 256 - num_filters: 256 + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 graph_norm: true edge_embed_type: all_rij mp_type: updownscale_base @@ -92,44 +111,80 @@ runs: - config: fanet-qm7x-all optim: batch_size: 100 - energy_coefficient: 0.01 - energy_grad_coefficient: 0 - force_coefficient: 0.99 + initial_lr: 0.001 + max_steps: 1000000 + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 model: - graph_norm: false - force_decoder_type: mlp + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 + graph_norm: true edge_embed_type: all_rij - regress_forces: direct mp_type: updownscale_base - num_interactions: 4 - regress_forces: direct + force_decoder_type: mlp + regress_forces: direct_with_gradient_target - config: fanet-qm7x-all optim: - batch_size: 100 - energy_coefficient: 0.01 + batch_size: 50 + initial_lr: 0.0005 + max_steps: 1000000 + energy_coefficient: 1 energy_grad_coefficient: 0 - force_coefficient: 0.99 + force_coefficient: 100 model: - hidden_channels: 256 - num_filters: 256 + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 graph_norm: true edge_embed_type: all_rij mp_type: updownscale_base force_decoder_type: mlp - regress_forces: from_energy + regress_forces: direct - config: fanet-qm7x-all optim: - batch_size: 100 + batch_size: 50 + initial_lr: 0.0005 + max_steps: 1000000 energy_coefficient: 1 energy_grad_coefficient: 0 - force_coefficient: 0 + force_coefficient: 100 model: - graph_norm: false - force_decoder_type: mlp + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 + graph_norm: true edge_embed_type: all_rij + mp_type: simple + complex_mp: true + second_layer_mlp: true + force_decoder_type: mlp regress_forces: direct - mp_type: updownscale_base + + - config: fanet-qm7x-all + optim: + batch_size: 50 + initial_lr: 0.0005 + max_steps: 1000000 + energy_coefficient: 0 + energy_grad_coefficient: 0 + force_coefficient: 1 + model: num_interactions: 4 - regress_forces: "" + hidden_channels: 160 + num_filters: 160 + 
graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct + + + + + + + From 3345a29f987aa8f603bdf03cbd632e90324a7eea Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 17:34:50 -0500 Subject: [PATCH 211/273] fix energy grad logging --- ocpmodels/models/base_model.py | 18 ++++++++---------- ocpmodels/trainers/base_trainer.py | 4 ++-- ocpmodels/trainers/single_trainer.py | 5 +++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ocpmodels/models/base_model.py b/ocpmodels/models/base_model.py index 59648e902b..3a0ac3a93f 100644 --- a/ocpmodels/models/base_model.py +++ b/ocpmodels/models/base_model.py @@ -34,9 +34,8 @@ def forces_forward(self, preds): def forward(self, data): grad_forces = forces = None - if self.regress_forces in {"from_energy", "direct_with_gradient_target"}: - # energy gradient w.r.t. positions will be computed - data.pos.requires_grad_(True) + # energy gradient w.r.t. positions will be computed + data.pos.requires_grad_(True) # predict energy preds = self.energy_forward(data) @@ -47,13 +46,12 @@ def forward(self, data): # predict forces forces = self.forces_forward(preds) - if self.regress_forces in {"from_energy", "direct_with_gradient_target"}: - if "gemnet" in self.__class__.__name__.lower(): - # gemnet forces are already computed - grad_forces = forces - else: - # compute forces from energy gradient - grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) + if "gemnet" in self.__class__.__name__.lower(): + # gemnet forces are already computed + grad_forces = forces + else: + # compute forces from energy gradient + grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) if self.regress_forces == "from_energy": # predicted forces are the energy gradient diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 901908345a..745fcf06d9 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -610,7 +610,7 @@ def validate( ): if dist_utils.is_master() and not self.silent: print() - logging.info(f"🧐 Evaluating on {split}.") + logging.info(f"\n >>> 🧐 Evaluating on {split}.") if self.is_hpo: disable_tqdm = True @@ -679,7 +679,7 @@ def validate( if dist_utils.is_master() and not self.silent: log_str = ["{}: {:.4f}".format(k, v) for k, v in log_dict.items()] - print("\n > ".join([""] + log_str)) + print(("\n > ".join([""] + log_str))[1:]) print() # Make plots. 
diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 36d11a1d73..c8a35a7e90 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -545,7 +545,7 @@ def compute_loss(self, preds, batch_list): loss["energy_grad_loss"] = self.loss_fn["force"]( preds["forces"][mask], grad_target[mask] ) - if self.model.regress_forces == "direct_with_energy_grad": + if self.model.module.regress_forces == "direct_with_energy_grad": energy_grad_mult = self.config["optim"].get( "energy_grad_coefficient", 10 ) @@ -655,7 +655,8 @@ def log_train_metrics(self, end_of_epoch=False): if not self.silent: log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] print( - f"Train metrics at step {self.step}:\n > " + "\n > ".join(log_str) + f"\nTrain metrics at step {self.step}:\n > " + + "\n > ".join(log_str) ) self.metrics = {} From a0cd09d065636765f66b1cba7d7eda9ac38eff43 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 19:49:19 -0500 Subject: [PATCH 212/273] typo --- scripts/compute_is2re_lse.py | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 scripts/compute_is2re_lse.py diff --git a/scripts/compute_is2re_lse.py b/scripts/compute_is2re_lse.py new file mode 100644 index 0000000000..924a8d6c53 --- /dev/null +++ b/scripts/compute_is2re_lse.py @@ -0,0 +1,47 @@ +import json +from pathlib import Path +import h5py +from tqdm import tqdm +import numpy as np +from sklearn.feature_extraction import DictVectorizer +import sys + +sys.path.append(str(Path(__file__).resolve().parent.parent)) + +from ocpmodels.datasets.lmdb_dataset import LmdbDataset + + +def count_fn(y): + return dict(zip(*np.unique(y, return_counts=True))) + + +if __name__ == "__main__": + # from SO3Krates + # https://github.com/thorben-frank/mlff/blob/v0.1/mlff/src/data/preprocessing.py#L297 + ds = LmdbDataset({"src": "/network/projects/ocp/oc20/is2re/all/train/"}) + data = [(d["y"], d["atomic_numbers"]) for d in tqdm(ds, total=len(ds))] + + q = np.array([d[0].item() for d in data]) + max_n_atoms = max([len(d[1]) for d in data]) + z = np.array([np.pad(d[1], (0, max_n_atoms - len(d[1]))) for d in data]) + u = np.unique(z) + idx_ = u != 0 # remove padding with 0 + lhs_counts = list(map(count_fn, z)) + v = DictVectorizer(sparse=False) + X = v.fit_transform(lhs_counts) + X = X[..., idx_] + + sol = np.linalg.lstsq(X, q, rcond=None) + shifts = np.zeros(np.max(u) + 1) + for k, v in dict(zip(u[idx_], sol[0])).items(): + shifts[k] = v + + ( + Path("/home/mila/s/schmidtv/ocp-project/ocp-drlab") + / "configs" + / "models" + / "is2re-metadata" + / "lse-shifts.json" + ).write_text(json.dumps(shifts.tolist())) + + q_shifts = shifts[z].sum(-1) From d7e720688390c54d1c074eefe2394bfaa8f658ec Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 19:49:48 -0500 Subject: [PATCH 213/273] typo --- ocpmodels/common/exp_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 6a2bead459..fb2005a7b3 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -13,7 +13,8 @@ sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path +from ocpmodels.common.utils import ROOT, RUN_DIR +from ocpmodels.common.orion_utils import get_and_move_orion_db_path EXP_OUT_DIR = ROOT / "data" / "exp_outputs" MANAGER_CACHE = ROOT 
/ "data" / "exp_manager_cache" From c66c4e1b640d621907578ff189fd7173cce305d4 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 13:15:41 -0500 Subject: [PATCH 214/273] sbatch resume --- configs/sbatch/defaults.yaml | 3 ++ main.py | 2 +- ocpmodels/common/exp_manager.py | 2 +- ocpmodels/common/utils.py | 28 +++++++++++----- sbatch.py | 59 +++++++++++++++++++++++++++++++-- 5 files changed, 81 insertions(+), 13 deletions(-) diff --git a/configs/sbatch/defaults.yaml b/configs/sbatch/defaults.yaml index 97afac49df..d00797d755 100644 --- a/configs/sbatch/defaults.yaml +++ b/configs/sbatch/defaults.yaml @@ -28,6 +28,9 @@ code_loc: null # code location. Defaults to the current repository path output: "$SCRATCH/ocp/runs/%j/output-%t.txt" # slurm output file per task (%t) logdir: "$SCRATCH/ocp/runs/$SLURM_JOB_ID" # --logdir value for main.py, appended to py_args if not already present +continue_from_dir: null +restart_from_dir: null + env: "ocp" # env name for `conda activate {env}` py_args: "" # arguments for main.py note: "" # wandb run note diff --git a/main.py b/main.py index 01fc6c3c6d..cb81c7fb61 100644 --- a/main.py +++ b/main.py @@ -109,7 +109,7 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): # -- Initial setup setup_imports() - print("All things imported.\n") + print("\n🚩 All things imported.\n") start_time = time.time() try: diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index fb2005a7b3..0106863cd0 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -235,7 +235,7 @@ def parse_output_files(self): elif "srun: Job step aborted" in out_txt: if "slurmstepd" in out_txt and " CANCELLED AT " in out_txt: self.cache["job_state"][j] = "Cancelled" - elif "nan_loss" in out_txt: + elif "Loss is NaN. Stopping training." 
in out_txt: self.cache["job_state"][j] = "NaN loss" else: self.cache["job_state"][j] = "Unknown" diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 10afc56ce8..57e91e2944 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -164,7 +164,7 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): ): return trainer_config - print("\nMoving data to slurm tmpdir", flush=True) + print("\n🚉 Copying data to slurm tmpdir", flush=True) tmp_dir = os.environ.get("SLURM_TMPDIR") or f"/Tmp/slurm.{JOB_ID}.0" tmp_dir = Path(tmp_dir) @@ -177,21 +177,21 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): new_dir = tmp_dir / original.name if new_dir.exists(): print( - f"Data already copied to {str(new_dir)} for split", + f" Data already copied to {str(new_dir)} for split", f"{s} with source path {split['src']}", flush=True, ) trainer_config["dataset"][s]["src"] = str(new_dir) continue - print("Making new_dir: ", str(new_dir), flush=True) + print(" Making new_dir: ", str(new_dir), flush=True) new_dir.mkdir() command = ["cp", "-r", f"{str(original)}", str(new_dir.parent)] - print("Copying data: ", " ".join(command), flush=True) + print(" Copying data: ", " ".join(command), flush=True) subprocess.run(command) for f in new_dir.glob("*.lmdb-lock"): f.unlink() trainer_config["dataset"][s]["src"] = str(new_dir) - print("Done moving data to", str(new_dir), flush=True) + print(" Done moving data to", str(new_dir), flush=True) return trainer_config @@ -877,7 +877,7 @@ def set_cpus_to_workers(config): workers = cpus // gpus if not config["silent"]: print( - f"Overriding num_workers from {config['optim']['num_workers']}", + f"🏭 Overriding num_workers from {config['optim']['num_workers']}", f"to {workers} to match the machine's CPUs.", "Use --no_cpus_to_workers=true to disable this behavior.", ) @@ -1002,7 +1002,7 @@ def build_config(args, args_override): dels[k] = copy.deepcopy(continue_config[k]) continue_config[k] = None print( - "Removing orion config from continue config. Set to None:", + "🅾️ Removing orion config from continue config. 
Set to None:", "{" + ", ".join([f"{k}: {v}->None" for k, v in dels.items()]) + "}", ) print( @@ -1032,11 +1032,23 @@ def build_config(args, args_override): config["world_size"] = args.num_nodes * args.num_gpus if continue_config: - new_dirs = [(k, v) for k, v in config.items() if "dir" in k] + new_dirs = [ + (k, v) for k, v in config.items() if "dir" in k and k != "cp_data_to_tmpdir" + ] + data_srcs = copy.deepcopy( + { + k: { + "src": v["src"] + } # keep original src, if data was moved in the resumed exp + for k, v in config["dataset"].items() + if isinstance(v, dict) and "src" in v + } + ) config = merge_dicts( continue_config, {k: resolve(v) if isinstance(v, str) else v for k, v in new_dirs}, ) + config["dataset"] = merge_dicts(config["dataset"], data_srcs) cli = cli_args_dict() if "max_steps" in cli.get("optim", {}): if "max_epochs" in cli.get("optim", {}): diff --git a/sbatch.py b/sbatch.py index bf17bd5c6f..b7a4d174d7 100644 --- a/sbatch.py +++ b/sbatch.py @@ -1,4 +1,4 @@ -from minydra import resolved_args +from minydra import resolved_args, MinyDict from pathlib import Path from datetime import datetime import os @@ -77,7 +77,9 @@ def discover_minydra_defaults(): user_config = root / "configs" / "sbatch" / f"{os.environ['USER']}.yaml" if user_config.exists() and user_config.is_file(): defaults.append(user_config) - return defaults + return MinyDict( + {k: v for d in defaults for k, v in yaml.safe_load(d.read_text()).items()} + ) def resolve(path): @@ -214,17 +216,68 @@ def write_orion_config(args, outdir): (outdir / f"{unique_exp_name}.exp").touch() +def load_sbatch_args_from_dir(dir): + dir = resolve(dir) + sbatch_files = list(dir.glob("sbatch_*.sh")) + if not sbatch_files: + raise FileNotFoundError(f"No sbatch file found in {str(dir)}") + sbatch_file = sbatch_files[0] + sbatch_lines = [ + line.split("#SBATCH")[1].strip() + for line in sbatch_file.read_text().splitlines() + if "#SBATCH " in line + ] + sbatch_args = {} + for line in sbatch_lines: + k, v = ( + line[2:] + if line.startswith("--") + else line[1:] + if line.startswith("-") + else line + ).split("=") + sbatch_args[k] = v + args = { + "job_name": sbatch_args["job-name"], + "nodes": int(sbatch_args["nodes"]), + "ntasks_per_node": int(sbatch_args["ntasks-per-node"]), + "partition": sbatch_args["partition"], + "cpus": int(sbatch_args["cpus-per-task"]), + "mem": sbatch_args["mem"], + "gres": sbatch_args["gres"], + "output": sbatch_args["output"], + } + return args + + if __name__ == "__main__": # has the submission been successful? success = False wandb_offline = "" sbatch_py_vars = {} + minydra_defaults = discover_minydra_defaults() # repository root root = Path(__file__).resolve().parent # parse and resolve args. # defaults are loaded and overwritten from the command-line as `arg=value` - args = resolved_args(defaults=discover_minydra_defaults()) + args = resolved_args(defaults=minydra_defaults) + + if args.restart_from_dir or args.continue_from_dir: + if args.restart_from_dir and args.continue_from_dir: + raise ValueError( + "Cannot restart and continue from the same directory. 
" + "Please specify only one of restart_from_dir= or continue_from_dir=" + ) + resume_dir = args.restart_from_dir or args.continue_from_dir + mode = "restart" if args.restart_from_dir else "continue" + sba = load_sbatch_args_from_dir(resume_dir) + cli_sba = {k: v for k, v in args.items() if v != minydra_defaults[k]} + args = MinyDict({**args, **sba, **cli_sba}) + if not args.py_args: + args.py_args = "" + args.py_args += f" --{mode}_from_dir={str(resume_dir)}" + modules = ( [] if not args.modules From e6a754dd697dc7d5fbdd482691a534e465aa2cd6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 13:58:40 -0500 Subject: [PATCH 215/273] enable gradient to log force energy grad --- ocpmodels/trainers/base_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 745fcf06d9..2c98c45a27 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -599,7 +599,6 @@ def train(self): """Derived classes should implement this function.""" pass - @torch.no_grad() def validate( self, split="val", @@ -608,6 +607,7 @@ def validate( is_final=False, is_first=False, ): + torch.set_grad_enabled(bool(self.config["model"].get("regress_forces", ""))) if dist_utils.is_master() and not self.silent: print() logging.info(f"\n >>> 🧐 Evaluating on {split}.") From 13bc4b5c9322c3d704f08797cb7ce53340184fc0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 17:47:10 -0500 Subject: [PATCH 216/273] update --- .../exps/icml/qm9/fanet-best-all-targets.yaml | 207 ++++++++++++++++++ configs/exps/icml/qm9/fanet-manual-lse.yaml | 6 +- configs/exps/qm7x/schnet-fanet-lse.yaml | 8 +- configs/models/is2re-metadata/lse-shifts.json | 1 + launch_exp.py | 16 +- ocpmodels/trainers/single_trainer.py | 12 +- 6 files changed, 231 insertions(+), 19 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-best-all-targets.yaml create mode 100644 configs/models/is2re-metadata/lse-shifts.json diff --git a/configs/exps/icml/qm9/fanet-best-all-targets.yaml b/configs/exps/icml/qm9/fanet-best-all-targets.yaml new file mode 100644 index 0000000000..94a35433fb --- /dev/null +++ b/configs/exps/icml/qm9/fanet-best-all-targets.yaml @@ -0,0 +1,207 @@ +# scheduler reduce lr on plateau +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + # dev: true + # verbose: true + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-lse + log_train_every: 200 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size, lr_initial + frame_averaging: 3D + fa_frames: random + dataset: + train: + lse_shift: true + val: + lse_shift: true + test: + lse_shift: true + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + batch_size: 64 + initial_lr: 0.0005 + max_epochs: 1500 + loss_energy: mse + loss_force: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 500 + # all below is for the ReduceLROnPlateau scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + model: + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: True + hidden_channels: 110 + max_num_neighbors: 40 + mp_type: updownscale_base + 
num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: true + phys_hidden_channels: 0 + regress_forces: "" + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + use_pbc: false + + +runs: + - dataset: + train: + target: 0 + val: + target: 0 + test: + target: 0 + - dataset: + train: + target: 1 + val: + target: 1 + test: + target: 1 + - dataset: + train: + target: 2 + val: + target: 2 + test: + target: 2 + - dataset: + train: + target: 3 + val: + target: 3 + test: + target: 3 + - dataset: + train: + target: 4 + val: + target: 4 + test: + target: 4 + - dataset: + train: + target: 5 + val: + target: 5 + test: + target: 5 + - dataset: + train: + target: 6 + val: + target: 6 + test: + target: 6 + - dataset: + train: + target: 7 + val: + target: 7 + test: + target: 7 + - dataset: + train: + target: 8 + val: + target: 8 + test: + target: 8 + - dataset: + train: + target: 9 + val: + target: 9 + test: + target: 9 + - dataset: + train: + target: 10 + val: + target: 10 + test: + target: 10 + - dataset: + train: + target: 11 + val: + target: 11 + test: + target: 11 + - dataset: + train: + target: 12 + val: + target: 12 + test: + target: 12 + - dataset: + train: + target: 13 + val: + target: 13 + test: + target: 13 + - dataset: + train: + target: 14 + val: + target: 14 + test: + target: 14 + - dataset: + train: + target: 15 + val: + target: 15 + test: + target: 15 + - dataset: + train: + target: 16 + val: + target: 16 + test: + target: 16 + - dataset: + train: + target: 17 + val: + target: 17 + test: + target: 17 + - dataset: + train: + target: 18 + val: + target: 18 + test: + target: 18 diff --git a/configs/exps/icml/qm9/fanet-manual-lse.yaml b/configs/exps/icml/qm9/fanet-manual-lse.yaml index 877dc62249..d3ddd8f39b 100644 --- a/configs/exps/icml/qm9/fanet-manual-lse.yaml +++ b/configs/exps/icml/qm9/fanet-manual-lse.yaml @@ -55,7 +55,7 @@ default: hidden_channels: 160 num_filters: 160 max_num_neighbors: 30 - mp_type: updownscale_bae + mp_type: updownscale_base num_gaussians: 50 num_interactions: 4 otf_graph: false @@ -78,7 +78,7 @@ runs: graph_norm: false hidden_channels: 96 max_num_neighbors: 30 - mp_type: updownscale_bae + mp_type: updownscale_base num_filters: 224 num_gaussians: 128 num_interactions: 4 @@ -92,7 +92,7 @@ runs: graph_norm: true hidden_channels: 110 max_num_neighbors: 40 - mp_type: updownscale_bae + mp_type: updownscale_base num_filters: 384 num_gaussians: 64 num_interactions: 4 diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml index 423440f996..26fa07d862 100644 --- a/configs/exps/qm7x/schnet-fanet-lse.yaml +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -58,7 +58,7 @@ default: lse_shift: True val_id: lse_shift: True - val_ood: + test: lse_shift: True runs: @@ -182,9 +182,3 @@ runs: force_decoder_type: mlp regress_forces: direct - - - - - - diff --git a/configs/models/is2re-metadata/lse-shifts.json b/configs/models/is2re-metadata/lse-shifts.json new file mode 100644 index 0000000000..153cfc0851 --- /dev/null +++ b/configs/models/is2re-metadata/lse-shifts.json @@ -0,0 +1 @@ +[0.0, -0.03168837170037106, 0.0, 0.0, 0.0, -0.018916724897170528, -0.07516872833312764, 0.0016345619191071165, 0.013287692526233786, 0.0, 0.0, -0.010099893232592166, 0.0, -0.013262464762207531, -0.0040779598604658, -0.002059577810890844, 0.0022706829024690778, 0.0028789674984750213, 0.0, -0.015036774393646663, -0.043021322980277346, -0.06111072563346156, -0.04210049361780378, 
-0.03422540877588413, -0.022856732968670863, -0.03103305334142486, -0.026134560547974663, -0.019305524888569604, -0.013493352269468968, -0.008163195381331444, -0.010904761470386284, -0.00405098156937312, 0.001852697634236705, 0.0020690026083794895, 0.0035793557500934155, 0.0, 0.0, -0.013383204575329636, -0.03858810522009537, -0.0576401252068183, -0.059088351366679136, -0.036947509002007, -0.027747679316385585, -0.02807965978344211, -0.01876626301744238, -0.011546582100635678, -0.0055021423373771625, -0.0001371757442072078, -0.006848676469819966, -0.0014570150904904752, -0.0001179312223327613, 0.0002476578996626392, 0.005820879587264792, 0.0, 0.0, -0.016141335226783824, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.052932107762963754, -0.039172881142421144, -0.025784458949892612, -0.028377167031415778, -0.02227405667797154, -0.013746130132304628, -0.007585348194747942, -0.002274605617351183, -0.0002101065095853269, 0.001659669572096274, 0.0035125185202919567, 0.0028364718870182238] \ No newline at end of file diff --git a/launch_exp.py b/launch_exp.py index 7fa87240b8..87046a1e6a 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -257,6 +257,8 @@ def get_args_or_exp(key, args, exp): for c, command in enumerate(commands): print(f"Launching job {c+1:3}", end="\r") outputs.append(os.popen(command).read().strip()) + if " verbose=true" in command.lower(): + print(outputs[-1]) except KeyboardInterrupt: is_interrupted = True outdir = ROOT / "data" / "exp_outputs" / exp_name @@ -276,12 +278,12 @@ def get_args_or_exp(key, args, exp): with outfile.open("w") as f: f.write(text) print("\n\n ✅ Done!") - print(util_strings(jobs)) - # print(f" • Output written to {str(outfile)}") - yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) - print( - " • Experiment summary YAML in ", - f"./{str(yml_out.relative_to(Path.cwd()))}", - ) + if jobs: + print(util_strings(jobs)) + yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) + print( + " • Experiment summary YAML in ", + f"./{str(yml_out.relative_to(Path.cwd()))}", + ) else: print("Aborting") diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index c8a35a7e90..416af29799 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -271,8 +271,16 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): print("\n\n >>> 🛑 Loss is NaN. Stopping training.\n\n") self.logger.add_tags(["nan_loss"]) return "loss_is_nan" - self._backward(loss) - + try: + self._backward(loss) + except RuntimeError: + print("\nBackward loss issue") + print(loss) + print( + "Requires grad:", + {k: v.requires_grad for k, v in self.loss.items()}, + ) + print() # Compute metrics. 
self.metrics = self.compute_metrics( preds, From 2a482da2d85fe6e6860f51bc91d6b316d1af5f8f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 18:21:15 -0500 Subject: [PATCH 217/273] update orion search --- configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 73 +++++++++++++++++++ configs/models/tasks/qm9.yaml | 6 +- 2 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml new file mode 100644 index 0000000000..7b9a6bd8ea --- /dev/null +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -0,0 +1,73 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 8GB + cpus: 4 + gres: gpu:1 + time: 02:50:00 + partition: long + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion + log_train_every: 200 + optim: + batch_size: 64 + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + loss_energy: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 600 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: num_filters, pg_hidden_channels, num_gaussians + frame_averaging: 3D + fa_frames: random + model: + mp_type: updownscale_base + edge_embed_type: all_rij + energy_head: "" + num_gaussians: 100 + pg_hidden_channels: 32 + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 150 + + unique_exp_name: fanet-qm9-v6.0.0 + + space: + optim/max_epochs: fidelity(600, 1000, base=12) + optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) + model/cutoff: uniform(5, 6, precision=1) + model/graph_norm: choices([True, False]) + model/hidden_channels: choices([100, 200, 300, 400, 500, 1000]) + model/max_num_neighbours: choices([30, 40, 50]) + model/num_filters: uniform(7, 16, discrete=True) + model/num_interactions: uniform(3, 5, discrete=True) + model/phys_embeds: choices([True, False]) + model/second_layer_mlp: choices([True, False]) + model/skip_co: choices([True, False]) + algorithms: + asha: + seed: 123 + num_rungs: 3 + num_brackets: 2 diff --git a/configs/models/tasks/qm9.yaml b/configs/models/tasks/qm9.yaml index 42b4256a1e..ecdc1d1dac 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -30,7 +30,7 @@ default: target: 12 # predict atomization energy at 0K at index 12 seed: 123 normalize_labels: True # mean and std of target will be set by qm9.py if this is True - lse_shift: false + lse_shift: true indices: start: 0 end: 110000 @@ -38,7 +38,7 @@ default: src: /network/projects/ocp/qm9 target: 12 # predict atomization energy at 0K at index 12 seed: 123 - lse_shift: false + lse_shift: true indices: start: 110000 end: 120000 @@ -46,7 +46,7 @@ default: src: /network/projects/ocp/qm9 target: 12 # predict atomization energy at 0K at index 12 seed: 123 - lse_shift: false + lse_shift: true indices: start: 120000 end: -1 From a59b58271bb9c2c4e42c6c0a33d39363b0750e5b Mon Sep 17 00:00:00 2001 
From: Victor Schmidt Date: Sat, 21 Jan 2023 18:22:01 -0500 Subject: [PATCH 218/273] slightly larger time --- configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml index 7b9a6bd8ea..d555ad0351 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -3,7 +3,7 @@ job: mem: 8GB cpus: 4 gres: gpu:1 - time: 02:50:00 + time: 02:55:00 partition: long default: From 64f63a3304771893a43eb1031cd216316460f2f3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 18:56:31 -0500 Subject: [PATCH 219/273] typo --- configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml index d555ad0351..b8d1f1c396 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -52,7 +52,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 150 - unique_exp_name: fanet-qm9-v6.0.0 + unique_exp_name: fanet-qm9-v6.0.1 space: optim/max_epochs: fidelity(600, 1000, base=12) @@ -60,7 +60,7 @@ orion: model/cutoff: uniform(5, 6, precision=1) model/graph_norm: choices([True, False]) model/hidden_channels: choices([100, 200, 300, 400, 500, 1000]) - model/max_num_neighbours: choices([30, 40, 50]) + model/max_num_neighbors: choices([30, 40, 50]) model/num_filters: uniform(7, 16, discrete=True) model/num_interactions: uniform(3, 5, discrete=True) model/phys_embeds: choices([True, False]) From ecd8b5eb10dd3d4acc9453702b8dd1dd850d5d8c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 19:12:32 -0500 Subject: [PATCH 220/273] early-stopping file --- ocpmodels/trainers/base_trainer.py | 8 +++++++- ocpmodels/trainers/single_trainer.py | 17 ++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 2c98c45a27..e816c5239e 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -25,7 +25,7 @@ from torch.utils.data import DataLoader from torch_geometric.data import Batch from tqdm import tqdm - +from uuid import uuid4 from ocpmodels.common import dist_utils from ocpmodels.common.data_parallel import ( BalancedBatchSampler, @@ -54,6 +54,7 @@ def __init__(self, **kwargs): model_name = kwargs["model"].pop( "name", kwargs.get("model_name", "Unknown - base_trainer issue") ) + self.early_stopping_file = resolve(run_dir) / f"{str(uuid4())}.stop" kwargs["model"]["graph_rewiring"] = kwargs.get("graph_rewiring") self.config = { @@ -64,6 +65,7 @@ def __init__(self, **kwargs): "checkpoint_dir": str(resolve(run_dir) / "checkpoints"), "results_dir": str(resolve(run_dir) / "results"), "logs_dir": str(resolve(run_dir) / "logs"), + "early_stopping_file": str(self.early_stopping_file), } self.sigterm = False @@ -147,6 +149,10 @@ def __init__(self, **kwargs): if dist_utils.is_master() and not self.silent: print(f"\n🧰 Trainer config:\n{'-'*18}\n") print(yaml.dump(self.config), end="\n\n") + print( + f"\n\n🚦 Create {str(self.early_stopping_file)}", + "to stop the training after the next validation\n", + ) self.load() self.evaluator = Evaluator( diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 
416af29799..3657c8b686 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -346,10 +346,21 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): checkpoint_file="best_checkpoint.pt", training_state=False, ) - if self.early_stopper.should_stop( - current_val_metric, self.scheduler.get_lr(), self.epoch + if ( + self.early_stopper.should_stop( + current_val_metric, self.scheduler.get_lr(), self.epoch + ) + or self.early_stopping_file.exists() ): - print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") + if self.early_stopping_file.exists(): + print("\n\n >>> 🛑 Early stopping file found.\n\n") + self.early_stopping_file.rename( + self.early_stopping_file.parent + / f"{self.early_stopping_file.stem}_{self.now}.txt" + ) + else: + print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") + if self.logger: self.logger.add_tags(["E-S"]) return self.end_of_training( From 1e9c96620c53e9c9cbe4554d8e0580f65fa28dee Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 19:17:20 -0500 Subject: [PATCH 221/273] fix post val grad --- ocpmodels/trainers/base_trainer.py | 1 + ocpmodels/trainers/single_trainer.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e816c5239e..2bc1a575c3 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -705,6 +705,7 @@ def validate( if self.ema: self.ema.restore() + torch.set_grad_enabled(True) return metrics @abstractmethod diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 3657c8b686..d73a5f2743 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -271,6 +271,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): print("\n\n >>> 🛑 Loss is NaN. Stopping training.\n\n") self.logger.add_tags(["nan_loss"]) return "loss_is_nan" + try: self._backward(loss) except RuntimeError: @@ -278,7 +279,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): print(loss) print( "Requires grad:", - {k: v.requires_grad for k, v in self.loss.items()}, + {k: v.requires_grad for k, v in loss.items()}, ) print() # Compute metrics. 
From be17a53bf9e857410a99a95388ef5545350b5fc7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 19:29:41 -0500 Subject: [PATCH 222/273] typo `second_layer_MLP` --- configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml index b8d1f1c396..b0af9eec5b 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -52,7 +52,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 150 - unique_exp_name: fanet-qm9-v6.0.1 + unique_exp_name: fanet-qm9-v6.0.2 space: optim/max_epochs: fidelity(600, 1000, base=12) @@ -64,7 +64,7 @@ orion: model/num_filters: uniform(7, 16, discrete=True) model/num_interactions: uniform(3, 5, discrete=True) model/phys_embeds: choices([True, False]) - model/second_layer_mlp: choices([True, False]) + model/second_layer_MLP: choices([True, False]) model/skip_co: choices([True, False]) algorithms: asha: From a932725e76d3ea4326f84e0dd29825bbb2fb850f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 19:39:15 -0500 Subject: [PATCH 223/273] new qm7x exp --- configs/exps/qm7x/fanet-lse-v1.yaml | 150 ++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 configs/exps/qm7x/fanet-lse-v1.yaml diff --git a/configs/exps/qm7x/fanet-lse-v1.yaml b/configs/exps/qm7x/fanet-lse-v1.yaml new file mode 100644 index 0000000000..5dd3da7def --- /dev/null +++ b/configs/exps/qm7x/fanet-lse-v1.yaml @@ -0,0 +1,150 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + env: ocp-a100 + +default: + config: fanet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 0.201 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + att_heads: 1 + complex_mp: false + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: false + force_decoder_type: mlp + graph_norm: false + hidden_channels: 128 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 128 + num_gaussians: 20 + num_interactions: 4 + pg_hidden_channels: 0 + phys_embeds: false + second_layer_MLP: false + skip_co: false + tag_hidden_channels: 0 + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + test: + lse_shift: True + +runs: + - {} + - model: + regress_forces: direct_with_gradient_target + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: 
direct_with_gradient_target + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mlp: true + + - model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mlp: true + + - model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mlp: true + + - model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mlp: true + From a116ed2f92965e943fb513c39d2f11842a9eab9f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 20:17:19 -0500 Subject: [PATCH 224/273] update exps --- ...ts.yaml => fanet-best-v5-all-targets.yaml} | 0 .../icml/qm9/fanet-manual-lse-best-v5.yaml | 217 ++++++++++++++++++ configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 1 + configs/exps/qm7x/fanet-lse-v1.yaml | 23 +- 4 files changed, 237 insertions(+), 4 deletions(-) rename configs/exps/icml/qm9/{fanet-best-all-targets.yaml => fanet-best-v5-all-targets.yaml} (100%) create mode 100644 configs/exps/icml/qm9/fanet-manual-lse-best-v5.yaml diff --git a/configs/exps/icml/qm9/fanet-best-all-targets.yaml b/configs/exps/icml/qm9/fanet-best-v5-all-targets.yaml similarity index 100% rename from configs/exps/icml/qm9/fanet-best-all-targets.yaml rename to configs/exps/icml/qm9/fanet-best-v5-all-targets.yaml diff --git a/configs/exps/icml/qm9/fanet-manual-lse-best-v5.yaml b/configs/exps/icml/qm9/fanet-manual-lse-best-v5.yaml new file mode 100644 index 0000000000..8632b1132b --- /dev/null +++ b/configs/exps/icml/qm9/fanet-manual-lse-best-v5.yaml @@ -0,0 +1,217 @@ +# scheduler reduce lr on plateau +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-lse + log_train_every: 200 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size, lr_initial + frame_averaging: 3D + fa_frames: random + dataset: + train: + lse_shift: true + val: + lse_shift: true + test: + lse_shift: true + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + batch_size: 64 + initial_lr: 0.0005 + max_epochs: 1500 + loss_energy: mse + loss_force: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.00001 + es_warmup_epochs: 500 + # all below is for the ReduceLROnPlateau scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + model: + act: swish + att_heads: 1 + complex_mp: false + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: '' + force_decoder_type: null + graph_norm: true + hidden_channels: 110 + 
max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + pg_hidden_channels: 32 + phys_embeds: true + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + + +runs: + - model: + complex_mp: false + cutoff: 5.0 + graph_norm: true + hidden_channels: 110 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + pg_hidden_channels: 32 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - model: + complex_mp: false + cutoff: 5.0 + graph_norm: true + hidden_channels: 256 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - model: + complex_mp: false + cutoff: 5.0 + graph_norm: true + hidden_channels: 256 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + phys_embeds: false + second_layer_MLP: true + skip_co: true + + - model: + complex_mp: false + cutoff: 5.0 + graph_norm: true + hidden_channels: 384 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 50 + num_interactions: 4 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 384 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 50 + num_interactions: 5 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 384 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 50 + num_interactions: 5 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + batch_size: 128 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 384 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 50 + num_interactions: 5 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + batch_size: 128 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 100 + max_num_neighbors: 40 + num_filters: 100 + num_gaussians: 100 + num_interactions: 3 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + batch_size: 1024 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 100 + max_num_neighbors: 40 + num_filters: 100 + num_gaussians: 100 + num_interactions: 3 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + batch_size: 128 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 512 + max_num_neighbors: 40 + num_filters: 256 + num_gaussians: 50 + num_interactions: 3 + phys_embeds: true + second_layer_MLP: true + skip_co: true diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml index b0af9eec5b..3ca3a58048 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -66,6 +66,7 @@ orion: model/phys_embeds: choices([True, False]) model/second_layer_MLP: choices([True, False]) model/skip_co: choices([True, False]) + model/complex_mp: choices([True, False]) algorithms: asha: seed: 123 diff --git a/configs/exps/qm7x/fanet-lse-v1.yaml b/configs/exps/qm7x/fanet-lse-v1.yaml index 5dd3da7def..f2afd44367 100644 --- a/configs/exps/qm7x/fanet-lse-v1.yaml +++ b/configs/exps/qm7x/fanet-lse-v1.yaml @@ -116,7 +116,16 @@ runs: 
regress_forces: direct_with_gradient_target graph_norm: true skip_co: true - complex_mlp: true + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mp: true - model: num_interactions: 4 @@ -126,7 +135,6 @@ runs: regress_forces: direct_with_gradient_target graph_norm: true skip_co: true - complex_mlp: true - model: num_interactions: 4 @@ -136,7 +144,6 @@ runs: regress_forces: direct_with_gradient_target graph_norm: true skip_co: true - complex_mlp: true - model: num_interactions: 4 @@ -146,5 +153,13 @@ runs: regress_forces: direct_with_gradient_target graph_norm: true skip_co: true - complex_mlp: true + - model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mp: true From 56c8e6e6d9330443092f8842972830e1be15689f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 12:01:45 -0500 Subject: [PATCH 225/273] soft stop script --- configs/exps/qm7x/fanet-lse-v2.yaml | 220 ++++++++++++++++++++++++++ configs/exps/qm7x/fanet-orion-v1.yaml | 0 configs/models/tasks/qm7x.yaml | 1 + ocpmodels/trainers/single_trainer.py | 3 +- scripts/soft_stop_jobs.py | 28 ++++ 5 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 configs/exps/qm7x/fanet-lse-v2.yaml create mode 100644 configs/exps/qm7x/fanet-orion-v1.yaml create mode 100644 scripts/soft_stop_jobs.py diff --git a/configs/exps/qm7x/fanet-lse-v2.yaml b/configs/exps/qm7x/fanet-lse-v2.yaml new file mode 100644 index 0000000000..dedca23aee --- /dev/null +++ b/configs/exps/qm7x/fanet-lse-v2.yaml @@ -0,0 +1,220 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + env: ocp-a100 + +default: + config: fanet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + energy_head: False + frame_averaging: 3D + fa_frames: random + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 0.201 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + att_heads: 1 + complex_mp: false + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: false + force_decoder_type: mlp + graph_norm: false + hidden_channels: 128 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 128 + num_gaussians: 20 + num_interactions: 4 + pg_hidden_channels: 32 + phys_embeds: true + regress_forces: direct + second_layer_MLP: false + skip_co: false + tag_hidden_channels: 0 + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + test: + lse_shift: True + +runs: + - {} + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 0 + 
force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mp: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 3 + num_filters: 512 + hidden_channels: 1024 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mp: true diff --git a/configs/exps/qm7x/fanet-orion-v1.yaml b/configs/exps/qm7x/fanet-orion-v1.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 10919390a2..defb898e91 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -1,6 +1,7 @@ default: trainer: single logger: wandb + eval_on_test: True model: otf_graph: False diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index d73a5f2743..289e452f52 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -355,9 +355,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): ): if self.early_stopping_file.exists(): print("\n\n >>> 🛑 
Early stopping file found.\n\n") + now = self.now.replace(" ", "_").replace(":", "-") self.early_stopping_file.rename( self.early_stopping_file.parent - / f"{self.early_stopping_file.stem}_{self.now}.txt" + / f"{self.early_stopping_file.stem}_{now}.stopped" ) else: print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") diff --git a/scripts/soft_stop_jobs.py b/scripts/soft_stop_jobs.py new file mode 100644 index 0000000000..6abae6ae0b --- /dev/null +++ b/scripts/soft_stop_jobs.py @@ -0,0 +1,28 @@ +from minydra import resolved_args +from pathlib import Path +import os +import re + +if __name__ == "__main__": + args = resolved_args() + assert "jobs" in args + jobs = [ + j.strip() + for j in str(args.jobs).replace(",", " ").replace(" ", " ").split(" ") + ] + runs = Path(os.environ["SCRATCH"]) / "ocp" / "runs" + outs = [(runs / j / "output-0.txt") for j in jobs] + confirmed = args.no_confirm or ( + "y" + in input(f"\nAbout to early-stop jobs:\n {', '.join(jobs)}\nContinue? [y/n]: ") + ) + if confirmed: + for out in outs: + if not out.exists(): + print(f"Output file for job {out.parent.name} not found") + continue + stop = re.findall(r"early_stopping_file: (.+)", out.read_text()) + if stop: + Path(stop[0]).touch() + else: + print(f"Early stopping file not found in {str(out)}") From 29776e88d7375cc5a0f4cb5cde75ac0b5120562a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 12:32:17 -0500 Subject: [PATCH 226/273] update objective logging --- configs/exps/icml/qm7x/fanet-orion-v1.yaml | 108 +++++++++++++++++++++ configs/exps/qm7x/fanet-orion-v1.yaml | 0 ocpmodels/trainers/base_trainer.py | 2 + 3 files changed, 110 insertions(+) create mode 100644 configs/exps/icml/qm7x/fanet-orion-v1.yaml delete mode 100644 configs/exps/qm7x/fanet-orion-v1.yaml diff --git a/configs/exps/icml/qm7x/fanet-orion-v1.yaml b/configs/exps/icml/qm7x/fanet-orion-v1.yaml new file mode 100644 index 0000000000..90e4986367 --- /dev/null +++ b/configs/exps/icml/qm7x/fanet-orion-v1.yaml @@ -0,0 +1,108 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 5 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + env: ocp-a100 + +default: + config: fanet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + log_train_every: 250 + energy_head: False + frame_averaging: 3D + fa_frames: random + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 0.201 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + att_heads: 1 + complex_mp: false + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: false + force_decoder_type: mlp + graph_norm: false + hidden_channels: 128 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 128 + num_gaussians: 20 + num_interactions: 4 + pg_hidden_channels: 32 + phys_embeds: true + regress_forces: direct + second_layer_MLP: false + skip_co: false + tag_hidden_channels: 0 + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: 
+ lse_shift: True + test: + lse_shift: True + orion_mult_factor: + value: 25 + targets: num_filters, num_gaussians, force_coefficient + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 150 + + unique_exp_name: fanet-qm7x-v1.0.0 + + space: + optim/max_steps: fidelity(100000, 2000000, base=2) + optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) + optim/energy_grad_coefficient: uniform(1, 25, discrete=True) + optim/force_coefficient: gaussian(4, 0.5, discrete=True) + + model/complex_mp: choices([True, False]) + model/cutoff: uniform(4.5, 6.5, precision=1) + model/edge_embed_type: all_rij + model/graph_norm: choices([True, False]) + model/hidden_channels: choices([100, 200, 300, 400, 500, 1000]) + model/max_num_neighbors: choices([30, 40, 50]) + model/num_filters: uniform(7, 16, discrete=True) + model/num_gaussians: uniform(1, 5, discrete=True) + model/num_interactions: uniform(3, 7, discrete=True) + model/phys_embeds: choices([True, False]) + model/regress_forces: choices(['direct', 'direct_with_gradient_target']) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices([True, False]) + + algorithms: + asha: + seed: 123 + num_rungs: 8 + num_brackets: 2 diff --git a/configs/exps/qm7x/fanet-orion-v1.yaml b/configs/exps/qm7x/fanet-orion-v1.yaml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 2bc1a575c3..b6db9fb25c 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -866,6 +866,8 @@ def eval_all_splits( if self.config["model"].get("regress_forces", False): overall_forces_mae = cumulated_forces_mae / len(all_splits) self.logger.log({"Overall Forces MAE": overall_forces_mae}) + self.objective = (overall_energy_mae + overall_forces_mae) / 2 + self.logger.log({"Objective": self.objective}) # Run on test split if final and "test" in self.config["dataset"] and self.eval_on_test: From cce9213816ca362525a2248d7cd863929b2516c9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 12:36:34 -0500 Subject: [PATCH 227/273] don't print all orion outputs --- configs/exps/icml/qm7x/fanet-orion-v1.yaml | 4 ++-- launch_exp.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm7x/fanet-orion-v1.yaml b/configs/exps/icml/qm7x/fanet-orion-v1.yaml index 90e4986367..9e8bf2e19a 100644 --- a/configs/exps/icml/qm7x/fanet-orion-v1.yaml +++ b/configs/exps/icml/qm7x/fanet-orion-v1.yaml @@ -4,7 +4,7 @@ job: cpus: 5 gres: gpu:16gb:1 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-3 env: ocp-a100 default: @@ -77,7 +77,7 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 150 + n_jobs: 50 unique_exp_name: fanet-qm7x-v1.0.0 diff --git a/launch_exp.py b/launch_exp.py index 87046a1e6a..e25963210f 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -172,6 +172,7 @@ def get_args_or_exp(key, args, exp): if __name__ == "__main__": is_interrupted = False + n_jobs = None args = resolved_args() assert "exp" in args regex = args.get("match", ".*") @@ -235,7 +236,10 @@ def get_args_or_exp(key, args, exp): commands = [c for c in commands if re.findall(regex, c)] - print(f"🔥 About to run {len(commands)} jobs:\n\n • " + "\n\n • ".join(commands)) + print( + f"🔥 About to run {len(commands)} jobs:\n\n • " + + "\n\n • ".join(commands if 
n_jobs is None else commands[:1]) + ) separator = "\n" * 4 + f"{'#' * 80}\n" * 4 + "\n" * 4 text = "<><><> Experiment command: $ " + " ".join(["python"] + sys.argv) From effa8916029c23fc226aaccfad184bb20f7f67a2 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:20:49 -0500 Subject: [PATCH 228/273] improve `no_confirm` arg --- launch_exp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index e25963210f..4fe5eb1b1c 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -247,9 +247,9 @@ def get_args_or_exp(key, args, exp): text += "\n<><><> Experiment config:\n\n-----" + exp_file.read_text() + "-----" text += "\n<><><> Experiment runs:\n\n • " + "\n\n • ".join(commands) + separator - confirm = args.no_confirm or input("\n🚦 Confirm? [y/n] : ") + confirm = args.no_confirm or "y" in input("\n🚦 Confirm? [y/n] : ") - if confirm == "y": + if confirm: try: if "orion" in exp: search_path.parent.mkdir(exist_ok=True, parents=True) From 5b6b5c21f46e60b4bdf9560c2e99556fcd15e00c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:21:00 -0500 Subject: [PATCH 229/273] handle map function --- ocpmodels/common/timer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/timer.py b/ocpmodels/common/timer.py index cb38f1c731..d30935fd26 100644 --- a/ocpmodels/common/timer.py +++ b/ocpmodels/common/timer.py @@ -46,7 +46,7 @@ def reset(self): self.times = defaultdict(list) self.timers = {} - def prepare_for_logging(self): + def prepare_for_logging(self, map_func=lambda x: x): """ Computes mean and standard deviation of all timers. Returns a tuple: (mean_times_dict, std_times_dict) @@ -57,8 +57,9 @@ def prepare_for_logging(self): mean_times = {} std_times = {} for k, v in self.times.items(): - mean_times[k] = np.mean(v) - std_times[k] = np.std(v) + data = list(map(map_func, v)) + mean_times[k] = np.mean(data) + std_times[k] = np.std(data) return mean_times, std_times def next(self, name, ignore=None): From 874083c5d53aad512a375ab062d611dc4e2795f2 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:21:09 -0500 Subject: [PATCH 230/273] silent of trainer is silent --- ocpmodels/modules/scheduler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index ca440b1854..6eba1fd39b 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -22,10 +22,11 @@ class LRScheduler: optimizer (obj): torch optim object """ - def __init__(self, optimizer, optim_config): + def __init__(self, optimizer, optim_config, silent=False): self.optimizer = optimizer self.optim_config = optim_config.copy() self.warmup_scheduler = None + self.silent = silent if self.optim_config.get("scheduler"): self.scheduler_type = self.optim_config["scheduler"] else: @@ -47,9 +48,11 @@ def scheduler_lambda_fn(x): T_max = self.optim_config.get("fidelity_max_steps") if T_max is None: T_max = self.optim_config["max_steps"] - print(f"Using max_steps for scheduler -> {T_max}") + if not self.silent: + print(f"Using max_steps for scheduler -> {T_max}") else: - print(f"Using fidelity_max_steps for scheduler -> {T_max}") + if not self.silent: + print(f"Using fidelity_max_steps for scheduler -> {T_max}") self.warmup_scheduler = warmup.ExponentialWarmup( self.optimizer, warmup_period=self.optim_config["warmup_steps"] From fb555967e7f2add4abf471a333c82e20a776896c Mon Sep 17 00:00:00 2001 From: Victor Schmidt 
Date: Sun, 22 Jan 2023 20:21:26 -0500
Subject: [PATCH 231/273] rename file

---
 .../{measure_val_times.py => legacy_phast_measure_val_times.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/{measure_val_times.py => legacy_phast_measure_val_times.py} (100%)

diff --git a/scripts/measure_val_times.py b/scripts/legacy_phast_measure_val_times.py
similarity index 100%
rename from scripts/measure_val_times.py
rename to scripts/legacy_phast_measure_val_times.py

From 50a2467ba89ccbf12948470343f0fe02f7b09d35 Mon Sep 17 00:00:00 2001
From: Victor Schmidt
Date: Sun, 22 Jan 2023 20:21:36 -0500
Subject: [PATCH 232/273] script to keep running jobs

---
 scripts/watch_and_run_orion_jobs.py | 39 +++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 scripts/watch_and_run_orion_jobs.py

diff --git a/scripts/watch_and_run_orion_jobs.py b/scripts/watch_and_run_orion_jobs.py
new file mode 100644
index 0000000000..c23945c99d
--- /dev/null
+++ b/scripts/watch_and_run_orion_jobs.py
@@ -0,0 +1,39 @@
+import os
+from sys import exit
+from time import sleep
+
+from minydra import resolved_args
+
+
+def n_jobs():
+    return len(os.popen("squeue -u $USER").read().splitlines()) - 1
+
+
+if __name__ == "__main__":
+    args = resolved_args()
+    assert "exp" in args
+    hours = args.get("hours", 1)
+    min_jobs = args.get("min_jobs", 1)
+    cmd = f"python launch_exp.py exp={args.exp} no_confirm='y' " + "n_jobs={new_jobs}"
+    print(
+        f"\nChecking every {hours} hours for new jobs to launch for exp {args.exp}",
+        f"so that you always have at least {min_jobs} jobs running\n",
+    )
+
+    if "y" not in input("Continue? [y/n]: "):
+        exit()
+
+    i = 0
+
+    try:
+        while True:
+            j = n_jobs()
+            print(f"\nNumber of jobs at iteration {i}: {j}")
+            if j < min_jobs:
+                new_jobs = min_jobs - j
+                print(f" Launching {new_jobs} jobs at iteration {i}")
+                os.system(cmd.format(new_jobs=new_jobs))
+            i += 1
+            sleep(hours * 60 * 60)
+    except KeyboardInterrupt:
+        print("Exiting...")

From f7f9550dfdc3fd67e166f7dff5f02c7f2afa9e79 Mon Sep 17 00:00:00 2001
From: Victor Schmidt
Date: Sun, 22 Jan 2023 20:22:11 -0500
Subject: [PATCH 233/273] add silent mode

---
 ocpmodels/common/utils.py | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py
index 57e91e2944..31185919e9 100644
--- a/ocpmodels/common/utils.py
+++ b/ocpmodels/common/utils.py
@@ -866,7 +866,7 @@ def load_config_legacy(path: str, previous_includes: list = []):
     return config, duplicates_warning, duplicates_error
 
 
-def set_cpus_to_workers(config):
+def set_cpus_to_workers(config, silent=False):
     if not config.get("no_cpus_to_workers"):
         cpus = count_cpus()
         gpus = count_gpus()
@@ -875,7 +875,7 @@ def set_cpus_to_workers(config):
             workers = cpus - 1
         else:
             workers = cpus // gpus
-        if not config["silent"]:
+        if not config["silent"] and not silent:
             print(
                 f"🏭 Overriding num_workers from {config['optim']['num_workers']}",
                 f"to {workers} to match the machine's CPUs.",
) @@ -1001,18 +1001,22 @@ def build_config(args, args_override): if "orion" in k or "fidelity" in k: dels[k] = copy.deepcopy(continue_config[k]) continue_config[k] = None + if not silent: + print( + "🅾️ Removing orion config from continue config. Set to None:", + "{" + + ", ".join([f"{k}: {v}->None" for k, v in dels.items()]) + + "}", + ) + if not silent: print( - "🅾️ Removing orion config from continue config. Set to None:", - "{" + ", ".join([f"{k}: {v}->None" for k, v in dels.items()]) + "}", - ) - print( - f"✅ Loading config from directory {str(cont_dir)}" - + ( - f" and latest checkpoint: {latest_ckpt}" - if args.continue_from_dir - else " (restarting from scratch)" + f"✅ Loading config from directory {str(cont_dir)}" + + ( + f" and latest checkpoint: {latest_ckpt}" + if args.continue_from_dir + else " (restarting from scratch)" + ) ) - ) args.config = continue_config["config"] if args.config is None: @@ -1075,7 +1079,7 @@ def build_config(args, args_override): config = merge_dicts(config, cli) check_regress_forces(config) - config = set_cpus_to_workers(config) + config = set_cpus_to_workers(config, silent) config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) config = override_drac_paths(config) From e137faecee94b35326aa2387a81df2f2273f4ed0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:22:28 -0500 Subject: [PATCH 234/273] add inference mode in forward --- ocpmodels/common/data_parallel.py | 6 ++-- ocpmodels/models/base_model.py | 25 +++++++------- ocpmodels/trainers/base_trainer.py | 49 ++++++++++++++++------------ ocpmodels/trainers/single_trainer.py | 17 ++++++---- 4 files changed, 56 insertions(+), 41 deletions(-) diff --git a/ocpmodels/common/data_parallel.py b/ocpmodels/common/data_parallel.py index 9c57b6bc71..6f0ceca86b 100644 --- a/ocpmodels/common/data_parallel.py +++ b/ocpmodels/common/data_parallel.py @@ -48,12 +48,12 @@ def __init__(self, module, output_device, num_gpus): output_device=self.src_device, ) - def forward(self, batch_list): + def forward(self, batch_list, **kwargs): if self.cpu: - return self.module(batch_list[0]) + return self.module(batch_list[0], **kwargs) if len(self.device_ids) == 1: - return self.module(batch_list[0].to(f"cuda:{self.device_ids[0]}")) + return self.module(batch_list[0].to(f"cuda:{self.device_ids[0]}"), **kwargs) for t in chain(self.module.parameters(), self.module.buffers()): if t.device != self.src_device: diff --git a/ocpmodels/models/base_model.py b/ocpmodels/models/base_model.py index 3a0ac3a93f..ad769b6c09 100644 --- a/ocpmodels/models/base_model.py +++ b/ocpmodels/models/base_model.py @@ -31,11 +31,12 @@ def energy_forward(self, data): def forces_forward(self, preds): raise NotImplementedError - def forward(self, data): + def forward(self, data, mode="train"): grad_forces = forces = None # energy gradient w.r.t. 
positions will be computed - data.pos.requires_grad_(True) + if mode == "train" or self.regress_forces == "from_energy": + data.pos.requires_grad_(True) # predict energy preds = self.energy_forward(data) @@ -46,12 +47,13 @@ def forward(self, data): # predict forces forces = self.forces_forward(preds) - if "gemnet" in self.__class__.__name__.lower(): - # gemnet forces are already computed - grad_forces = forces - else: - # compute forces from energy gradient - grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) + if mode == "train" or self.regress_forces == "from_energy": + if "gemnet" in self.__class__.__name__.lower(): + # gemnet forces are already computed + grad_forces = forces + else: + # compute forces from energy gradient + grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) if self.regress_forces == "from_energy": # predicted forces are the energy gradient @@ -59,9 +61,10 @@ def forward(self, data): elif self.regress_forces in {"direct", "direct_with_gradient_target"}: # predicted forces are the model's direct forces preds["forces"] = forces - # store the energy gradient as the target. Used for metrics - # only in "direct" mode. - preds["forces_grad_target"] = grad_forces.detach() + if mode == "train": + # store the energy gradient as the target. Used for metrics + # only in "direct" mode. + preds["forces_grad_target"] = grad_forces.detach() else: raise ValueError( f"Unknown forces regression mode {self.regress_forces}" diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index b6db9fb25c..9252b2a9af 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -258,19 +258,20 @@ def load_datasets(self): * (n_train / batch_size) ) ) - print( - "Setting fidelity_max_steps to {}".format( - self.config["optim"]["fidelity_max_steps"] + if not self.silent: + print( + "Setting fidelity_max_steps to {}".format( + self.config["optim"]["fidelity_max_steps"] + ) ) - ) if max_samples > 0: - if max_epochs > 0: + if max_epochs > 0 and not self.silent: print( "\nWARNING: Both max_samples and max_epochs are set.", "Using max_samples.", ) - if max_steps > 0: + if max_steps > 0 and not self.silent: print( "WARNING: Both max_samples and max_steps are set.", "Using max_samples.\n", @@ -282,7 +283,7 @@ def load_datasets(self): np.ceil(max_samples / batch_size) ) elif max_steps > 0: - if max_epochs > 0: + if max_epochs > 0 and not self.silent: print( "\nWARNING: Both max_steps and max_epochs are set.", "Using max_steps.\n", @@ -290,23 +291,25 @@ def load_datasets(self): self.config["optim"]["max_epochs"] = int( np.ceil(max_steps / (n_train / batch_size)) ) - print( - "Setting max_epochs to", - self.config["optim"]["max_epochs"], - f"from max_steps ({max_steps}),", - f"dataset length ({n_train}),", - f"and batch_size ({batch_size})\n", - ) + if not self.silent: + print( + "Setting max_epochs to", + self.config["optim"]["max_epochs"], + f"from max_steps ({max_steps}),", + f"dataset length ({n_train}),", + f"and batch_size ({batch_size})\n", + ) else: self.config["optim"]["max_steps"] = int( np.ceil(max_epochs * (n_train / batch_size)) ) - print( - "Setting max_steps to ", - f"{self.config['optim']['max_steps']} from", - f"max_epochs ({max_epochs}), dataset length", - f"({n_train}), and batch_size ({batch_size})\n", - ) + if not self.silent: + print( + "Setting max_steps to ", + f"{self.config['optim']['max_steps']} from", + f"max_epochs ({max_epochs}), dataset length", + f"({n_train}), and batch_size 
({batch_size})\n", + ) self.samplers[split] = self.get_sampler( self.datasets[split], batch_size, shuffle=shuffle @@ -498,7 +501,11 @@ def load_optimizer(self): ) def load_extras(self): - self.scheduler = LRScheduler(self.optimizer, self.config["optim"]) + self.scheduler = LRScheduler( + self.optimizer, + self.config["optim"], + silent=self.silent, + ) self.clip_grad_norm = self.config["optim"].get("clip_grad_norm") self.ema_decay = self.config["optim"].get("ema_decay") if self.ema_decay: diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 289e452f52..d5a3d2a197 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -78,10 +78,11 @@ def load_task(self): device=self.device, ) else: - print( - "Warning: grad_target_mean not found in normalizer but", - "regress_forces and normalize_labels are true.", - ) + if not self.silent: + print( + "Warning: grad_target_mean not found in normalizer but", + "regress_forces and normalize_labels are true.", + ) self.normalizers["grad_target"] = Normalizer( tensor=self.datasets["train"].data.y[ self.datasets["train"].__indices__ @@ -431,7 +432,7 @@ def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times) for ds in self.datasets.values(): ds.close_db() - def model_forward(self, batch_list): + def model_forward(self, batch_list, mode="train"): # Distinguish frame averaging from base case. if self.config["frame_averaging"] and self.config["frame_averaging"] != "DA": original_pos = batch_list[0].pos @@ -444,7 +445,10 @@ def model_forward(self, batch_list): batch_list[0].pos = batch_list[0].fa_pos[i] if self.task_name in OCP_TASKS: batch_list[0].cell = batch_list[0].fa_cell[i] - preds = self.model(deepcopy(batch_list)) + + # forward pass + preds = self.model(deepcopy(batch_list), mode=mode) + e_all.append(preds["energy"]) if preds.get("pooling_loss") is not None: p_all.append(preds["pooling_loss"]) @@ -460,6 +464,7 @@ def model_forward(self, batch_list): .view(-1, 3) ) f_all.append(g_forces) + batch_list[0].pos = original_pos if self.task_name in OCP_TASKS: batch_list[0].cell = original_cell From bf406946709e0eb72cfcaebd4754bb3e85ad3709 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:39:31 -0500 Subject: [PATCH 235/273] update force coefficient search space --- configs/exps/icml/qm7x/fanet-orion-v1.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm7x/fanet-orion-v1.yaml b/configs/exps/icml/qm7x/fanet-orion-v1.yaml index 9e8bf2e19a..d7df89b3e6 100644 --- a/configs/exps/icml/qm7x/fanet-orion-v1.yaml +++ b/configs/exps/icml/qm7x/fanet-orion-v1.yaml @@ -79,17 +79,16 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 50 - unique_exp_name: fanet-qm7x-v1.0.0 + unique_exp_name: fanet-qm7x-v1.0.1 space: optim/max_steps: fidelity(100000, 2000000, base=2) optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) optim/energy_grad_coefficient: uniform(1, 25, discrete=True) - optim/force_coefficient: gaussian(4, 0.5, discrete=True) + optim/force_coefficient: uniform(3, 5, discrete=True) model/complex_mp: choices([True, False]) model/cutoff: uniform(4.5, 6.5, precision=1) - model/edge_embed_type: all_rij model/graph_norm: choices([True, False]) model/hidden_channels: choices([100, 200, 300, 400, 500, 1000]) model/max_num_neighbors: choices([30, 40, 50]) From feb61c4c3e2f49cd76f833bddd9b82c30a051ef7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 
23 Jan 2023 10:20:26 -0500 Subject: [PATCH 236/273] inference timing script --- scripts/measure_val_inference_time.py | 111 ++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 scripts/measure_val_inference_time.py diff --git a/scripts/measure_val_inference_time.py b/scripts/measure_val_inference_time.py new file mode 100644 index 0000000000..c17fb5a9e3 --- /dev/null +++ b/scripts/measure_val_inference_time.py @@ -0,0 +1,111 @@ +import copy +import sys +from argparse import Namespace +from pathlib import Path + +import torch +from minydra import resolved_args +from tqdm import tqdm + +sys.path.append(str(Path(__file__).resolve().parent.parent)) + +from ocpmodels.common.timer import Times +from ocpmodels.common.utils import ( + build_config, + move_lmdb_data_to_slurm_tmpdir, + resolve, + setup_imports, +) +from ocpmodels.trainers.single_trainer import SingleTrainer + +if __name__ == "__main__": + args = resolved_args( + defaults={ + "base_path": "$SCRATCH/ocp/runs", + "n_loops": 1, + "others": "", + "job_ids": "", + }, + ).pretty_print() + base = resolve(args.base_path) + job_ids = [j.strip() for j in str(args.job_ids).split(",")] + paths = [Path(base) / j for j in job_ids if j] + [ + resolve(p.strip()) for p in args.others.split(",") + ] + run_dir = resolve("$SCRATCH/ocp/inference_time") + + setup_imports() + + torch.set_grad_enabled(False) + + conf_args = [ + Namespace( + restart_from_dir=str(p), + continue_from_dir=None, + keep_orion_config=False, + run_dir=run_dir / "-".join(job_ids), + num_nodes=1, + num_gpus=1, + ) + for p in paths + ] + configs = [ + build_config(ca, [], silent=True) + for ca in tqdm(conf_args, desc="Loading configs".ljust(40)) + ] + configs = [(l, config) for config in configs for l in range(args.n_loops)] + names = [ + f'{config["restart_from_dir"].name}-{config["config"]}' for _, config in configs + ] + + times = {} + + for k, (l, config) in enumerate( + tqdm( + configs, + desc=f"Timing {args.n_loops}x{len(conf_args)}={len(configs)} configs".ljust( + 40 + ), + ) + ): + config["logger"] = "dummy" + config["silent"] = True + + od = copy.deepcopy(config["dataset"]) + for split in od: + if split != "default_val" and split != config["dataset"]["default_val"]: + del config["dataset"][split] + config = move_lmdb_data_to_slurm_tmpdir(config) + for split in od: + if split != "default_val" and split != config["dataset"]["default_val"]: + config["dataset"][split] = od[split] + + if l == 0: + trainer = SingleTrainer(**config) + timer = Times(gpu=True) + + name = names[k] + + for i, b in enumerate( + tqdm( + trainer.loaders[trainer.config["dataset"]["default_val"]], + desc=f"{name} (loop {l+1}/{args.n_loops})".ljust(40), + leave=False, + ) + ): + with torch.cuda.amp.autocast(enabled=trainer.scaler is not None): + with timer.next("forward"): + _ = trainer.model_forward(b, mode="inference") + + if l == args.n_loops - 1: + mean, std = timer.prepare_for_logging( + map_func=lambda t: t / trainer.config["optim"]["batch_size"] + ) + times[name] = mean["forward"] + + print( + " • " + + "\n • ".join( + f"{k}: {v:.6f} s / sample = {1/v:.2f} samples / s" for k, v in times.items() + ) + ) From f3bdf8b36ab8ceb0fc9de02a00ed8f2eb9cb2e1d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 12:49:40 -0500 Subject: [PATCH 237/273] fix eval_on_test --- ocpmodels/trainers/base_trainer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 
9252b2a9af..1c02706b49 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -78,10 +78,7 @@ def __init__(self, **kwargs): self.test_ri = self.config["test_ri"] self.is_debug = self.config["is_debug"] self.is_hpo = self.config["is_hpo"] - if self.task_name == "qm9": - self.eval_on_test = self.config["eval_on_test"] - else: - self.eval_on_test = False + self.eval_on_test = bool(self.config.get("eval_on_test")) self.silent = self.config["silent"] self.datasets = {} self.samplers = {} From cd73b6fb6627596f3ef79b1611eaeae2ff90db72 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 12:59:14 -0500 Subject: [PATCH 238/273] fix checkpoint continue --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 31185919e9..b2867addf6 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -992,9 +992,9 @@ def build_config(args, args_override, silent=False): latest_ckpt = str( sorted(ckpts, key=lambda c: float(c.stem.split("-")[-1]))[-1] ) + continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if args.continue_from_dir: continue_config["checkpoint"] = str(latest_ckpt) - continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if not args.keep_orion_config: dels = {} for k in continue_config: From cb7a488c301a496f9a3a432ce65d69144bfe57a0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 16:40:53 -0500 Subject: [PATCH 239/273] remove detect-anomaly --- ocpmodels/models/forcenet.py | 1 - ocpmodels/models/sfarinet.py | 1 - 2 files changed, 2 deletions(-) diff --git a/ocpmodels/models/forcenet.py b/ocpmodels/models/forcenet.py index ebb9ae4bd1..2a47327923 100644 --- a/ocpmodels/models/forcenet.py +++ b/ocpmodels/models/forcenet.py @@ -250,7 +250,6 @@ class ForceNet(BaseModel): def __init__(self, **kwargs): super(ForceNet, self).__init__() - torch.autograd.set_detect_anomaly(True) self.ablation = kwargs["ablation"] self.basis = kwargs["basis"] self.cutoff = kwargs["cutoff"] diff --git a/ocpmodels/models/sfarinet.py b/ocpmodels/models/sfarinet.py index da3ad2a985..1833b9a2c4 100644 --- a/ocpmodels/models/sfarinet.py +++ b/ocpmodels/models/sfarinet.py @@ -329,7 +329,6 @@ class SfariNet(BaseModel): def __init__(self, **kwargs): super().__init__() - torch.autograd.set_detect_anomaly(True) self.cutoff = kwargs["cutoff"] self.use_pbc = kwargs["use_pbc"] self.max_num_neighbors = kwargs["max_num_neighbors"] From 551419c7ce862ab0b33d0d123d6647acdc141117 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 17:48:06 -0500 Subject: [PATCH 240/273] auto long-grace for orion long jobs --- launch_exp.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 4fe5eb1b1c..ca1d5fe964 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -195,12 +195,17 @@ def get_args_or_exp(key, args, exp): exp["unique_exp_name"] = unique_exp_name search_path = orion_base / "search-spaces" / f"{ts}-{unique_exp_name}.yaml" + job_dict = { + "job_name": unique_exp_name, + } + + if (exp["job"].get("partition") or "long") == "long": + job_dict["partition"] = "long-grace" + runs = [ { "orion_exp_config_path": str(search_path), - "job": { - "job_name": unique_exp_name, - }, + "job": job_dict, } for _ in range(n_jobs) ] From a9a98c5d3c03af7dfd48a4ae62f4d1ac779865a0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 17:52:37 -0500 Subject: 
[PATCH 241/273] v2

---
 .../exps/icml/s2ef/fanet-orion-s2ef-2.yaml | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml

diff --git a/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml b/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml
new file mode 100644
index 0000000000..5cab52d455
--- /dev/null
+++ b/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml
@@ -0,0 +1,64 @@
+# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij
+job:
+  mem: 32GB
+  cpus: 4
+  gres: gpu:1
+  partition: long
+
+default:
+  wandb_project: ocp-3
+  config: fanet-s2ef-2M
+  mode: train
+  test_ri: true
+  wandb_tags: s2ef-2M, orion
+  cp_data_to_tmpdir: true
+  graph_rewiring: remove-tag-0
+  model:
+    edge_embed_type: all_rij
+  frame_averaging: 2D
+  fa_frames: random
+  optim:
+    scheduler: LinearWarmupCosineAnnealingLR
+    force_coefficient: 100
+    energy_coefficient: 1
+    energy_grad_coefficient: 5
+  note:
+    model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co
+    optim: lr_initial, warmup_steps
+    _root_: frame_averaging, fa_frames
+  orion_mult_factor:
+    value: 32
+    targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels
+
+orion:
+  # Remember to change the experiment name if you change anything in the search space
+  n_jobs: 50
+
+  unique_exp_name: fanet-s2ef-2M-v1.2
+
+  space:
+    model/att_heads: choices([1,2,3,4])
+    model/complex_mp: choices([True, False])
+    model/cutoff: choices([4.0, 6.0, 10.0])
+    model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"])
+    model/graph_norm: choices([True, False])
+    model/hidden_channels: uniform(6, 22, discrete=True)
+    model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"])
+    model/num_filters: uniform(2, 18, discrete=True)
+    model/num_gaussians: uniform(30, 150, discrete=True)
+    model/num_interactions: uniform(3, 6, discrete=True)
+    model/pg_hidden_channels: uniform(0, 1, discrete=True)
+    model/phys_embeds: choices([True, False])
+    model/regress_forces: choices(["direct_with_gradient_target", "direct"])
+    model/second_layer_MLP: choices([True, False])
+    model/skip_co: choices(["add", "concat", False])
+    model/tag_hidden_channels: uniform(0, 2, discrete=True)
+    model/max_num_neighbors: choices([30,40,50])
+    optim/lr_initial: loguniform(5e-5, 5e-4, precision=2)
+    optim/max_epochs: fidelity(6, 22, base=6)
+
+  algorithms:
+    asha:
+      seed: 123
+      num_rungs: 4
+      num_brackets: 2

From aeebbaec6c76af0f1359d5f4303ec96f28caf2a5 Mon Sep 17 00:00:00 2001
From: AlexDuvalinho
Date: Mon, 23 Jan 2023 18:06:51 -0500
Subject: [PATCH 242/273] new configs

---
 .../exps/icml/is2re-all/fanet-orion-4.yaml    |   2 +-
 configs/exps/icml/is2re-all/top-config-3.yaml | 246 ++++++++++++++
 .../exps/icml/s2ef/fanet-orion-s2ef-2.yaml    |  65 ++++
 configs/exps/icml/s2ef/top-config.yaml        | 300 ++++++++++++++++++
 configs/models/tasks/s2ef.yaml                |  18 +-
 5 files changed, 621 insertions(+), 10 deletions(-)
 create mode 100644 configs/exps/icml/is2re-all/top-config-3.yaml
 create mode 100644 configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml
 create mode 100644 configs/exps/icml/s2ef/top-config.yaml

diff --git a/configs/exps/icml/is2re-all/fanet-orion-4.yaml b/configs/exps/icml/is2re-all/fanet-orion-4.yaml
index f86ea559f4..dd1c46f035 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-4.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-4.yaml @@ -31,7 +31,7 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 150 + n_jobs: 25 unique_exp_name: fanet-is2re-all-v4 diff --git a/configs/exps/icml/is2re-all/top-config-3.yaml b/configs/exps/icml/is2re-all/top-config-3.yaml new file mode 100644 index 0000000000..eb18bc2f77 --- /dev/null +++ b/configs/exps/icml/is2re-all/top-config-3.yaml @@ -0,0 +1,246 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 15:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + wandb_tags: 'best-config' + optim: + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: True + +runs: + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 10 + eval_every: 0.25 + + - config: fanet-is2re-all # 2700544 + note: 'top-run eval every epoch' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 12 + eval_every: 1 + + - config: fanet-is2re-all # 2700544 + note: 'top-run eval every epoch' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 12 + eval_every: 1 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 14 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 380 + num_gaussians: 80 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: 
LinearWarmupCosineAnnealingLR + max_epochs: 12 + eval_every: 0.25 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 5.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 10 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 70 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: True + skip_co: concat + cutoff: 5.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 11 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + eval_every: 0.4 + + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 300 + num_gaussians: 75 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 5.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 13 + eval_every: 0.4 \ No newline at end of file diff --git a/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml b/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml new file mode 100644 index 0000000000..7a4b3da0fd --- /dev/null +++ b/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml @@ -0,0 +1,65 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-s2ef-2M + mode: train + test_ri: true + wandb_tags: s2ef-2M, orion, v2 + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + graph_norm: True + frame_averaging: 2D + fa_frames: random + optim: + scheduler: LinearWarmupCosineAnnealingLR + force_coefficient: 100 + energy_coefficient: 1 + energy_grad_coefficient: 5 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change 
the experiment name if you change anything in the search space + n_jobs: 40 + + unique_exp_name: fanet-s2ef-2M-v2 + + space: + model/att_heads: choices([1,2,3,4]) + model/complex_mp: choices([True, False]) + model/cutoff: choices([4.0, 6.0, 8.0]) + model/energy_head: choices(["weighted-av-final-embeds"]) + model/graph_norm: choices([True, False]) + model/hidden_channels: uniform(6, 21, discrete=True) + model/mp_type: choices(["simple", "base", "updownscale", "updownscale_base", "base_with_att", "updown_local_env"]) + model/num_filters: uniform(3, 18, discrete=True) + model/num_gaussians: uniform(40, 160, discrete=True) + model/num_interactions: uniform(3, 7, discrete=True) + model/pg_hidden_channels: uniform(0, 3, discrete=True) + model/phys_embeds: choices([True, False]) + model/regress_forces: choices(["direct_with_gradient_target", "direct"]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["concat", False]) + model/tag_hidden_channels: uniform(0, 2, discrete=True) + model/max_num_neighbors: choices([30,40,50]) + optim/lr_initial: loguniform(9e-5, 5e-4, precision=2) + optim/max_epochs: fidelity(8, 22, base=6) + + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 diff --git a/configs/exps/icml/s2ef/top-config.yaml b/configs/exps/icml/s2ef/top-config.yaml new file mode 100644 index 0000000000..4dc6f04800 --- /dev/null +++ b/configs/exps/icml/s2ef/top-config.yaml @@ -0,0 +1,300 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + cp_data_to_tmp: true + model: + energy_head: 'weighted-av-final-embeds' # False ? frame_averaging: 2D + fa_frames: random + frame_averaging: 2D + wandb_tags: 's2ef-top-config' + optim: + batch_size: 192 + eval_batch_size: 192 + scheduler: LinearWarmupCosineAnnealingLR + force_coefficient: 100 + energy_coefficient: 1 + energy_grad_coefficient: 5 + +runs: + - config: sfarinet-s2ef-2M + note: 'top-config' + fa_fames: random + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 40 + graph_norm: False + mp_type: updownscale + hidden_channels: 480 + num_gaussians: 145 + num_filters: 384 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + second_layer_mlp: False + complex_mp: True + second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'DA' + frame_averaging: DA + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 40 + graph_norm: False + mp_type: updownscale + hidden_channels: 480 + num_gaussians: 145 + num_filters: 384 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: False + second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'bigger se3-random' + frame_averaging: 2D + fa_frames: se3-random + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 40 + graph_norm: True + mp_type: updownscale + hidden_channels: 480 + num_gaussians: 145 + num_filters: 420 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'force only' + 
frame_averaging: 2D + fa_frames: se3-random + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + energy_coefficient: 0 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 40 + graph_norm: False + mp_type: updownscale + hidden_channels: 480 + num_gaussians: 145 + num_filters: 420 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'top-config' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 30 + graph_norm: True + mp_type: updownscale_base + hidden_channels: 256 + num_gaussians: 128 + num_filters: 480 + num_interactions: 7 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: False + + - config: sfarinet-s2ef-2M + note: 'top-config' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00027 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 30 + graph_norm: True + mp_type: updownscale_base + hidden_channels: 456 + num_gaussians: 128 + num_filters: 600 + num_interactions: 7 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: False + + - config: sfarinet-s2ef-2M + note: 'top-config' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00027 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 30 + graph_norm: True + mp_type: updownscale_base + hidden_channels: 456 + num_gaussians: 128 + num_filters: 600 + num_interactions: 7 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: False + + + - config: sfarinet-s2ef-2M + note: 'top-config' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00023 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 50 + graph_norm: True + mp_type: base + hidden_channels: 352 + num_gaussians: 99 + num_filters: 480 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: False + second_layer_MLP: False + + - config: sfarinet-s2ef-2M + note: '' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00023 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct + cutoff: 6 + max_num_neighbors: 50 + graph_norm: True + mp_type: base + hidden_channels: 352 + num_gaussians: 99 + num_filters: 480 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: False + second_layer_MLP: False + + - config: sfarinet-s2ef-2M + note: 'all' + frame_averaging: 2D + fa_frames: all + optim: + lr_initial: 0.00023 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 50 + graph_norm: True + mp_type: base + hidden_channels: 352 + num_gaussians: 99 + num_filters: 480 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: True + 
second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'from_energy' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00023 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: from_energy + cutoff: 6 + max_num_neighbors: 30 + graph_norm: True + mp_type: base + hidden_channels: 352 + num_gaussians: 99 + num_filters: 480 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: False + second_layer_MLP: True \ No newline at end of file diff --git a/configs/models/tasks/s2ef.yaml b/configs/models/tasks/s2ef.yaml index 4916788b07..ef62591945 100644 --- a/configs/models/tasks/s2ef.yaml +++ b/configs/models/tasks/s2ef.yaml @@ -22,37 +22,37 @@ default: dataset: default_val: val_id train: - src: /network/projects/_groups/ocp/oc20/s2ef/200k/train + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/200k/train/ normalize_labels: True target_mean: -0.7554450631141663 target_std: 2.887317180633545 grad_target_mean: 0.0 grad_target_std: 2.887317180633545 val_id: - src: /network/projects/_groups/ocp/oc20/s2ef/all/val_id + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/val_id val_ood_cat: - src: /network/projects/_groups/ocp/oc20/s2ef/all/val_ood_cat + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/val_ood_cat val_ood_ads: - src: /network/projects/_groups/ocp/oc20/s2ef/all/val_ood_ads + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/val_ood_ads val_ood_both: - src: /network/projects/_groups/ocp/oc20/s2ef/all/val_ood_both + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/val_ood_both 200k: dataset: train: - src: /network/projects/_groups/ocp/oc20/s2ef/200k/train + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/200k/train/ 2M: dataset: train: - src: /network/projects/_groups/ocp/oc20/s2ef/2M/train/ + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/2M/train/ 20M: dataset: train: - src: /network/projects/_groups/ocp/oc20/s2ef/20M/train/ + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/20M/train/ all: dataset: train: - src: /network/projects/_groups/ocp/oc20/s2ef/all/train/ + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/train/ From 5c35196b02d5959f87d8b06f87421eb79ebeb9ba Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 11:20:28 -0500 Subject: [PATCH 243/273] `#SBATCH --tmp=800GB` --- .../icml/qm9/fanet-best-v6-all-targets.yaml | 379 ++++++++++++++++++ sbatch.py | 1 + 2 files changed, 380 insertions(+) create mode 100644 configs/exps/icml/qm9/fanet-best-v6-all-targets.yaml diff --git a/configs/exps/icml/qm9/fanet-best-v6-all-targets.yaml b/configs/exps/icml/qm9/fanet-best-v6-all-targets.yaml new file mode 100644 index 0000000000..661a007a73 --- /dev/null +++ b/configs/exps/icml/qm9/fanet-best-v6-all-targets.yaml @@ -0,0 +1,379 @@ +# scheduler reduce lr on plateau +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + # dev: true + # verbose: true + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-lse + log_train_every: 200 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size, lr_initial + frame_averaging: 3D + fa_frames: random + dataset: + train: + lse_shift: true + val: + lse_shift: true + test: + lse_shift: true + optim: + warmup_steps: 3000 + # 
parameters EMA + ema_decay: 0.999 + batch_size: 64 + initial_lr: 0.0003 + max_epochs: 1500 + loss_energy: mae + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 600 + # all below is for the ReduceLROnPlateau scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.9 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + model: + complex_mp: true + cutoff: 6.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: True + hidden_channels: 400 + max_num_neighbors: 30 + mp_type: updownscale_base + num_filters: 480 + num_gaussians: 100 + num_interactions: 5 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: false + phys_hidden_channels: 0 + regress_forces: "" + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + use_pbc: false + + +runs: + - dataset: + train: + target: 0 + val: + target: 0 + test: + target: 0 + - dataset: + train: + target: 1 + val: + target: 1 + test: + target: 1 + - dataset: + train: + target: 2 + val: + target: 2 + test: + target: 2 + - dataset: + train: + target: 3 + val: + target: 3 + test: + target: 3 + - dataset: + train: + target: 4 + val: + target: 4 + test: + target: 4 + - dataset: + train: + target: 5 + val: + target: 5 + test: + target: 5 + - dataset: + train: + target: 6 + val: + target: 6 + test: + target: 6 + - dataset: + train: + target: 7 + val: + target: 7 + test: + target: 7 + - dataset: + train: + target: 8 + val: + target: 8 + test: + target: 8 + - dataset: + train: + target: 9 + val: + target: 9 + test: + target: 9 + - dataset: + train: + target: 10 + val: + target: 10 + test: + target: 10 + - dataset: + train: + target: 11 + val: + target: 11 + test: + target: 11 + - dataset: + train: + target: 12 + val: + target: 12 + test: + target: 12 + - dataset: + train: + target: 13 + val: + target: 13 + test: + target: 13 + - dataset: + train: + target: 14 + val: + target: 14 + test: + target: 14 + - dataset: + train: + target: 15 + val: + target: 15 + test: + target: 15 + - dataset: + train: + target: 16 + val: + target: 16 + test: + target: 16 + - dataset: + train: + target: 17 + val: + target: 17 + test: + target: 17 + - dataset: + train: + target: 18 + val: + target: 18 + test: + target: 18 + + - optim: + loss_energy: mse + dataset: + train: + target: 0 + val: + target: 0 + test: + target: 0 + - optim: + loss_energy: mse + dataset: + train: + target: 1 + val: + target: 1 + test: + target: 1 + - optim: + loss_energy: mse + dataset: + train: + target: 2 + val: + target: 2 + test: + target: 2 + - optim: + loss_energy: mse + dataset: + train: + target: 3 + val: + target: 3 + test: + target: 3 + - optim: + loss_energy: mse + dataset: + train: + target: 4 + val: + target: 4 + test: + target: 4 + - optim: + loss_energy: mse + dataset: + train: + target: 5 + val: + target: 5 + test: + target: 5 + - optim: + loss_energy: mse + dataset: + train: + target: 6 + val: + target: 6 + test: + target: 6 + - optim: + loss_energy: mse + dataset: + train: + target: 7 + val: + target: 7 + test: + target: 7 + - optim: + loss_energy: mse + dataset: + train: + target: 8 + val: + target: 8 + test: + target: 8 + - optim: + loss_energy: mse + dataset: + train: + target: 9 + val: + target: 9 + test: + target: 9 + - optim: + loss_energy: mse + dataset: + train: + target: 10 + val: + target: 10 + test: + target: 10 + - optim: + loss_energy: mse + dataset: + train: + target: 11 + val: + target: 11 + test: + target: 11 + - optim: + loss_energy: mse + dataset: + train: + target: 12 + 
val: + target: 12 + test: + target: 12 + - optim: + loss_energy: mse + dataset: + train: + target: 13 + val: + target: 13 + test: + target: 13 + - optim: + loss_energy: mse + dataset: + train: + target: 14 + val: + target: 14 + test: + target: 14 + - optim: + loss_energy: mse + dataset: + train: + target: 15 + val: + target: 15 + test: + target: 15 + - optim: + loss_energy: mse + dataset: + train: + target: 16 + val: + target: 16 + test: + target: 16 + - optim: + loss_energy: mse + dataset: + train: + target: 17 + val: + target: 17 + test: + target: 17 + - optim: + loss_energy: mse + dataset: + train: + target: 18 + val: + target: 18 + test: + target: 18 diff --git a/sbatch.py b/sbatch.py index b7a4d174d7..6d35dfcdb6 100644 --- a/sbatch.py +++ b/sbatch.py @@ -58,6 +58,7 @@ def make_sbatch_params(params): for k, v in params.items(): if v: sps.append(f"#SBATCH --{k}={v}") + sps.append("#SBATCH --tmp=800GB") return "\n".join(sps) + "\n" From 22ea5fc8ebb8f6a3ec52bc47573e99f058baa8c6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:03:27 -0500 Subject: [PATCH 244/273] update qm7x drac paths --- configs/models/tasks/_drac.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/models/tasks/_drac.yaml b/configs/models/tasks/_drac.yaml index bbfa6a9847..ac233b5bf1 100644 --- a/configs/models/tasks/_drac.yaml +++ b/configs/models/tasks/_drac.yaml @@ -95,3 +95,12 @@ qm9: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 test: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + +qm7x: + all: + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed + val: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed + test: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed From 5ab284d1886febc84aa345d0a90cc3217e9d1d5f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:07:26 -0500 Subject: [PATCH 245/273] fix val_id key --- configs/models/tasks/_drac.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/models/tasks/_drac.yaml b/configs/models/tasks/_drac.yaml index ac233b5bf1..d104f5f5b4 100644 --- a/configs/models/tasks/_drac.yaml +++ b/configs/models/tasks/_drac.yaml @@ -100,7 +100,7 @@ qm7x: all: train: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed - val: + val_id: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed test: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed From 64a61da5112a23b30ffa6bbfc187749a53d88832 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:18:40 -0500 Subject: [PATCH 246/273] narval dpp qm7x --- configs/exps/icml/qm7x/dpp-v1.yaml | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 configs/exps/icml/qm7x/dpp-v1.yaml diff --git a/configs/exps/icml/qm7x/dpp-v1.yaml b/configs/exps/icml/qm7x/dpp-v1.yaml new file mode 100644 index 0000000000..dc663d6870 --- /dev/null +++ b/configs/exps/icml/qm7x/dpp-v1.yaml @@ -0,0 +1,80 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 6 + gres: gpu:1 + time: 24:00:00 + +default: + config: dpp-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + log_train_every: 250 + energy_head: False + frame_averaging: 
3D + fa_frames: random + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 0.201 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + act: swish + basis_emb_size: 8 + cutoff: 6.0 + energy_head: false + envelope_exponent: 5 + graph_rewiring: '' + hidden_channels: 256 + int_emb_size: 64 + max_num_neighbors: 40 + num_after_skip: 2 + num_before_skip: 1 + num_blocks: 3 + num_output_layers: 3 + num_radial: 6 + num_spherical: 7 + otf_graph: false + out_emb_channels: 192 + pg_hidden_channels: 32 + phys_embeds: false + phys_hidden_channels: 0 + regress_forces: 'from_energy' + tag_hidden_channels: 0 + use_pbc: false + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + test: + lse_shift: True + +runs: + - {} + - model: + num_blocks: 6 \ No newline at end of file From 37101f684c7caa1ecff8bc0239181b0aff3e8557 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:21:37 -0500 Subject: [PATCH 247/273] no FA for dpp --- configs/exps/icml/qm7x/dpp-v1.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm7x/dpp-v1.yaml b/configs/exps/icml/qm7x/dpp-v1.yaml index dc663d6870..45651d6ecf 100644 --- a/configs/exps/icml/qm7x/dpp-v1.yaml +++ b/configs/exps/icml/qm7x/dpp-v1.yaml @@ -18,14 +18,14 @@ default: optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient log_train_every: 250 energy_head: False - frame_averaging: 3D - fa_frames: random + frame_averaging: "" + fa_frames: "" optim: batch_size: 100 max_steps: 2000000 warmup_steps: 3000 lr_initial: 0.00025 - eval_every: 0.201 + eval_every: 1 energy_coefficient: 1 energy_grad_coefficient: 0 force_coefficient: 100 From 78d19f3aa99ee953eabc78c3cc7167b482498ac3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:43:35 -0500 Subject: [PATCH 248/273] schnet dpp qm9 --- configs/exps/icml/qm9/schnet-dpp.yaml | 63 +++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 configs/exps/icml/qm9/schnet-dpp.yaml diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml new file mode 100644 index 0000000000..464ba0cb05 --- /dev/null +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -0,0 +1,63 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + time: 06:00:00 + +default: + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + log_train_every: 250 + energy_head: False + frame_averaging: "" + fa_frames: "" + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 1 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + dataset: + train: + 
rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + test: + lse_shift: True + +runs: + - config: dpp-qm9-all + - config: schnet-qm9-all + model: + cutoff: 5 + hidden_channels: 128 + max_num_neighbors: 40 + num_filters: 128 + num_gaussians: 100 + num_interactions: 6 + pg_hidden_channels: 0 + phys_embeds: false \ No newline at end of file From 0276b3a89605d524fd5d71e6709b5d3a8ea03e37 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:56:06 -0500 Subject: [PATCH 249/273] pop checkpoint --- configs/exps/icml/qm9/schnet-dpp.yaml | 21 +++++++++++++++++---- ocpmodels/common/utils.py | 2 ++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index 464ba0cb05..e9613f1358 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -3,7 +3,7 @@ job: mem: 32GB cpus: 4 gres: gpu:16gb:1 - time: 06:00:00 + time: "12:00:00" default: wandb_project: ocp-qm @@ -50,9 +50,22 @@ default: lse_shift: True runs: - - config: dpp-qm9-all - - config: schnet-qm9-all - model: +- config: dpp-qm9-all + model: + cutoff: 5 + num_spherical: 7 + num_radial: 6 + envelope_exponent: 5 + num_before_skip: 1 + num_after_skip: 2 + num_dense_output: 3 + optim: + batch_size: 32 + lr_initial: 0.001 + warmup_steps: 3000 + +- config: schnet-qm9-all + model: cutoff: 5 hidden_channels: 128 max_num_neighbors: 40 diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index b2867addf6..5e81d3a78b 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -995,6 +995,8 @@ def build_config(args, args_override, silent=False): continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if args.continue_from_dir: continue_config["checkpoint"] = str(latest_ckpt) + else: + continue_config.pop("checkpoint", None) if not args.keep_orion_config: dels = {} for k in continue_config: From 3bf7f887bb45c5c10a4d4589951b6d6ac24e4862 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:56:56 -0500 Subject: [PATCH 250/273] drac version --- configs/exps/icml/qm9/schnet-dpp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index e9613f1358..28a26648c7 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -2,7 +2,7 @@ job: mem: 32GB cpus: 4 - gres: gpu:16gb:1 + gres: gpu:1 time: "12:00:00" default: From 6372e53acf2e4b8ee0f58dfe14bd7a2e2cf4390d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:58:45 -0500 Subject: [PATCH 251/273] indent --- configs/exps/icml/qm9/schnet-dpp.yaml | 44 +++++++++++++-------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index 28a26648c7..bb8d7ac80a 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -50,27 +50,27 @@ default: lse_shift: True runs: -- config: dpp-qm9-all + - config: dpp-qm9-all + model: + cutoff: 5 + num_spherical: 7 + num_radial: 6 + envelope_exponent: 5 + num_before_skip: 1 + num_after_skip: 2 + num_dense_output: 3 + optim: + batch_size: 32 + lr_initial: 0.001 + warmup_steps: 3000 + + - config: schnet-qm9-all model: cutoff: 5 - num_spherical: 7 - num_radial: 6 - envelope_exponent: 5 - num_before_skip: 1 - num_after_skip: 2 - num_dense_output: 3 - optim: - batch_size: 32 - lr_initial: 
0.001 - warmup_steps: 3000 - -- config: schnet-qm9-all - model: - cutoff: 5 - hidden_channels: 128 - max_num_neighbors: 40 - num_filters: 128 - num_gaussians: 100 - num_interactions: 6 - pg_hidden_channels: 0 - phys_embeds: false \ No newline at end of file + hidden_channels: 128 + max_num_neighbors: 40 + num_filters: 128 + num_gaussians: 100 + num_interactions: 6 + pg_hidden_channels: 0 + phys_embeds: false \ No newline at end of file From b852a102ff613d3682eb9d73795624f0aef6ddb7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:59:14 -0500 Subject: [PATCH 252/273] typo --- configs/exps/icml/qm9/schnet-dpp.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index bb8d7ac80a..aa9987e216 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -51,18 +51,18 @@ default: runs: - config: dpp-qm9-all - model: - cutoff: 5 - num_spherical: 7 - num_radial: 6 - envelope_exponent: 5 - num_before_skip: 1 - num_after_skip: 2 - num_dense_output: 3 - optim: - batch_size: 32 - lr_initial: 0.001 - warmup_steps: 3000 + model: + cutoff: 5 + num_spherical: 7 + num_radial: 6 + envelope_exponent: 5 + num_before_skip: 1 + num_after_skip: 2 + num_dense_output: 3 + optim: + batch_size: 32 + lr_initial: 0.001 + warmup_steps: 3000 - config: schnet-qm9-all model: From de5fe57e25f9e8ad80ad13646e718c91bdb074c0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:59:51 -0500 Subject: [PATCH 253/273] update wandb tag --- configs/exps/icml/qm9/schnet-dpp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index aa9987e216..cf161b9fe5 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -9,7 +9,7 @@ default: wandb_project: ocp-qm mode: train test_ri: true - wandb_tags: qm7x + wandb_tags: qm9 cp_data_to_tmpdir: true note: task: name From 84ef5a6955bf9c117ec639d4deffc315c80defc3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 17:10:19 -0500 Subject: [PATCH 254/273] tyupo in dataset --- configs/exps/icml/qm9/schnet-dpp.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index cf161b9fe5..0d11460331 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -2,8 +2,8 @@ job: mem: 32GB cpus: 4 - gres: gpu:1 - time: "12:00:00" + gres: gpu:16gb:1 + # time: "12:00:00" default: wandb_project: ocp-qm @@ -44,7 +44,7 @@ default: train: rescale_with_hof: False lse_shift: True - val_id: + val: lse_shift: True test: lse_shift: True From 8d7509e82f3494b89d19dd2d30a9c0f803d17e93 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 20:12:34 -0500 Subject: [PATCH 255/273] delete wandb resume id --- ocpmodels/common/utils.py | 2 ++ ocpmodels/trainers/base_trainer.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 5e81d3a78b..f2e09c762a 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -995,8 +995,10 @@ def build_config(args, args_override, silent=False): continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if args.continue_from_dir: continue_config["checkpoint"] = str(latest_ckpt) + 
continue_config["job_ids"] = continue_config["job_ids"] + f", {JOB_ID}" else: continue_config.pop("checkpoint", None) + continue_config.pop("wandb_resume_id", None) if not args.keep_orion_config: dels = {} for k in continue_config: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 1c02706b49..13252ad414 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -433,7 +433,7 @@ def load_checkpoint(self, checkpoint_path): self.scaler.load_state_dict(checkpoint["amp"]) if "config" in checkpoint: - if "job_ids" in checkpoint["config"]: + if "job_ids" in checkpoint["config"] and JOB_ID not in checkpoint["config"]: self.config["job_ids"] = checkpoint["config"]["job_ids"] + f", {JOB_ID}" def load_loss(self): From 4c4b9ec2f18c180a5c8796d4488c61a73e158559 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 25 Jan 2023 00:41:44 -0500 Subject: [PATCH 256/273] nex exps --- configs/exps/icml/qm7x/dpp-v1.yaml | 17 ++--- configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml | 75 +++++++++++++++++++ configs/exps/icml/qm9/schnet-dpp.yaml | 58 +++++++++----- configs/models/tasks/qm9.yaml | 8 +- 4 files changed, 127 insertions(+), 31 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml diff --git a/configs/exps/icml/qm7x/dpp-v1.yaml b/configs/exps/icml/qm7x/dpp-v1.yaml index 45651d6ecf..eb658eb136 100644 --- a/configs/exps/icml/qm7x/dpp-v1.yaml +++ b/configs/exps/icml/qm7x/dpp-v1.yaml @@ -1,9 +1,8 @@ # trainset has 4068193 samples job: - mem: 32GB - cpus: 6 - gres: gpu:1 - time: 24:00:00 + mem: 24GB + cpus: 5 + gres: gpu:24gb:1 default: config: dpp-qm7x-all @@ -12,10 +11,6 @@ default: test_ri: true wandb_tags: qm7x cp_data_to_tmpdir: true - note: - task: name - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces - optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient log_train_every: 250 energy_head: False frame_averaging: "" @@ -25,7 +20,7 @@ default: max_steps: 2000000 warmup_steps: 3000 lr_initial: 0.00025 - eval_every: 1 + eval_every: 0 energy_coefficient: 1 energy_grad_coefficient: 0 force_coefficient: 100 @@ -77,4 +72,6 @@ default: runs: - {} - model: - num_blocks: 6 \ No newline at end of file + optim: + batch_size: 32 + num_blocks: 4 \ No newline at end of file diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml new file mode 100644 index 0000000000..afc11c43fb --- /dev/null +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml @@ -0,0 +1,75 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 8GB + cpus: 4 + gres: gpu:1 + time: 02:55:00 + partition: long + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion + log_train_every: 200 + optim: + batch_size: 64 + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + loss_energy: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 650 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + 
orion_mult_factor: + value: 25 + targets: num_filters, hidden_channels, num_gaussians + frame_averaging: 3D + fa_frames: random + model: + mp_type: updownscale_base + edge_embed_type: all_rij + energy_head: "" + num_gaussians: 100 + pg_hidden_channels: 32 + phys_embeds: True + second_layer_MLP: True + skip_co: True + complex_mp: True + graph_norm: True + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 30 + + unique_exp_name: fanet-qm9-v7.0.0 + + space: + optim/max_epochs: fidelity(650, 1000, base=8) + optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) + model/cutoff: uniform(4.5, 6.5, precision=1) + model/hidden_channels: uniform(10, 20, discrete=True) + model/max_num_neighbors: choices([30, 40, 50]) + model/num_gaussians: choices([50, 100, 150]) + model/num_filters: uniform(10, 20, discrete=True) + model/num_interactions: uniform(3, 6, discrete=True) + algorithms: + asha: + seed: 123 + num_rungs: 3 + num_brackets: 2 diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index 0d11460331..33683721bc 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -1,8 +1,8 @@ # trainset has 4068193 samples job: mem: 32GB - cpus: 4 - gres: gpu:16gb:1 + cpus: 6 + gres: gpu:rtx8000:1 # time: "12:00:00" default: @@ -11,10 +11,7 @@ default: test_ri: true wandb_tags: qm9 cp_data_to_tmpdir: true - note: - task: name - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces - optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + note: "qm9 dpp schnet baselines" log_train_every: 250 energy_head: False frame_averaging: "" @@ -24,7 +21,7 @@ default: max_steps: 2000000 warmup_steps: 3000 lr_initial: 0.00025 - eval_every: 1 + eval_every: 0 # 0 is n_train energy_coefficient: 1 energy_grad_coefficient: 0 force_coefficient: 100 @@ -50,6 +47,7 @@ default: lse_shift: True runs: + # # https://github.com/gasteigerjo/dimenet/blob/master/config.yaml - config: dpp-qm9-all model: cutoff: 5 @@ -64,13 +62,39 @@ runs: lr_initial: 0.001 warmup_steps: 3000 - - config: schnet-qm9-all - model: - cutoff: 5 - hidden_channels: 128 - max_num_neighbors: 40 - num_filters: 128 - num_gaussians: 100 - num_interactions: 6 - pg_hidden_channels: 0 - phys_embeds: false \ No newline at end of file + # # + # - config: schnet-qm9-all + # model: + # cutoff: 5 + # hidden_channels: 128 + # max_num_neighbors: 40 + # num_filters: 128 + # num_gaussians: 100 + # num_interactions: 6 + # pg_hidden_channels: 0 + # phys_embeds: false + # https://github.com/atomistic-machine-learning/SchNet/blob/master/scripts/train_energy_force.py#L149 + # - config: schnet-qm9-all + # optim: + # batch_size: 32 + # model: + # cutoff: 20 + # num_interactions: 6 + # num_gaussians: 64 + # num_filters: 64 + # hidden_channels: 128 + # pg_hidden_channels: 0 + # phys_embeds: false + # # https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/models/schnet.html#SchNet + # - config: schnet-qm9-all + # optim: + # batch_size: 32 + # model: + # max_num_neighbors: 32 + # cutoff: 10 + # num_gaussians: 50 + # num_interactions: 6 + # num_filters: 128 + # hidden_channels: 128 + # pg_hidden_channels: 0 + # phys_embeds: false diff --git a/configs/models/tasks/qm9.yaml b/configs/models/tasks/qm9.yaml index ecdc1d1dac..b13954b393 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -21,13 +21,13 @@ default: - 
internal energy at 0K normalizer: null - + # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.QM9.html mode: train dataset: default_val: val train: src: /network/projects/ocp/qm9 - target: 12 # predict atomization energy at 0K at index 12 + target: 7 seed: 123 normalize_labels: True # mean and std of target will be set by qm9.py if this is True lse_shift: true @@ -36,7 +36,7 @@ default: end: 110000 val: src: /network/projects/ocp/qm9 - target: 12 # predict atomization energy at 0K at index 12 + target: 7 seed: 123 lse_shift: true indices: @@ -44,7 +44,7 @@ default: end: 120000 test: src: /network/projects/ocp/qm9 - target: 12 # predict atomization energy at 0K at index 12 + target: 7 seed: 123 lse_shift: true indices: From 671e00a02b8c3a869afd3394f7f0ad237d600feb Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 25 Jan 2023 11:37:18 -0500 Subject: [PATCH 257/273] add spherenet FROM DIG (PIUP INSTALL) --- configs/models/spherenet.yaml | 69 +++++++++++++++++++++++++++++++++++ ocpmodels/models/spherenet.py | 58 +++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 configs/models/spherenet.yaml create mode 100644 ocpmodels/models/spherenet.py diff --git a/configs/models/spherenet.yaml b/configs/models/spherenet.yaml new file mode 100644 index 0000000000..1cea3c24bc --- /dev/null +++ b/configs/models/spherenet.yaml @@ -0,0 +1,69 @@ +default: + model: + name: spherenet + use_pbc: True + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + 10k: {} + 100k: {} + all: {} + +# ------------------ +# ----- S2EF ----- +# ------------------ + +s2ef: + default: {} + 200k: {} + 2M: {} + 20M: {} + all: {} + +qm9: + default: + model: + basis_emb_size_angle: 8 + basis_emb_size_dist: 8 + basis_emb_size_torsion: 8 + cutoff: 5.0 + energy_and_force: False + envelope_exponent: 5 + hidden_channels: 128 + int_emb_size: 64 + num_after_skip: 2 + num_before_skip: 1 + num_layers: 4 + num_output_layers: 3 + num_radial: 6 + num_spherical: 3 + out_channels: 1 + out_emb_channels: 256 + optim: + batch_size: 1024 + lr_initial: 0.001 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + lr_gamma: 0.25 + lr_milestones: + - 17981 + - 26972 + - 35963 + - 52000 + - 100000 + warmup_steps: 1000 + + 10k: {} + all: {} + +qm7x: + default: {} + all: {} + 1k: {} + + diff --git a/ocpmodels/models/spherenet.py b/ocpmodels/models/spherenet.py new file mode 100644 index 0000000000..7a5be57181 --- /dev/null +++ b/ocpmodels/models/spherenet.py @@ -0,0 +1,58 @@ +from dig.threedgraph.method import SphereNet as DIGSphereNet +from ocpmodels.models.base_model import BaseModel +import torch +from ocpmodels.common.registry import registry +from ocpmodels.common.utils import conditional_grad +from copy import deepcopy + + +class SphereNet(BaseModel): + def __init__(self, **kwargs): + super().__init__() + self.energy_and_force = kwargs.get("energy_and_force", False) + self.cutoff = kwargs.get("cutoff", 5.0) + self.num_layers = kwargs.get("num_layers", 4) + self.hidden_channels = kwargs.get("hidden_channels", 128) + self.out_channels = kwargs.get("out_channels", 1) + self.int_emb_size = kwargs.get("int_emb_size", 64) + self.basis_emb_size_dist = kwargs.get("basis_emb_size_dist", 8) + self.basis_emb_size_angle = kwargs.get("basis_emb_size_angle", 8) + self.basis_emb_size_torsion = kwargs.get("basis_emb_size_torsion", 8) + self.out_emb_channels = kwargs.get("out_emb_channels", 256) + self.num_spherical = 
kwargs.get("num_spherical", 3) + self.num_radial = kwargs.get("num_radial", 6) + self.envelope_exponent = kwargs.get("envelope_exponent", 5) + self.num_before_skip = kwargs.get("num_before_skip", 1) + self.num_after_skip = kwargs.get("num_after_skip", 2) + self.num_output_layers = kwargs.get("num_output_layers", 3) + self.spherenet = DIGSphereNet( + energy_and_force=self.energy_and_force, + cutoff=self.cutoff, + num_layers=self.num_layers, + hidden_channels=self.hidden_channels, + out_channels=self.out_channels, + int_emb_size=self.int_emb_size, + basis_emb_size_dist=self.basis_emb_size_dist, + basis_emb_size_angle=self.basis_emb_size_angle, + basis_emb_size_torsion=self.basis_emb_size_torsion, + out_emb_channels=self.out_emb_channels, + num_spherical=self.num_spherical, + num_radial=self.num_radial, + envelope_exponent=self.envelope_exponent, + num_before_skip=self.num_before_skip, + num_after_skip=self.num_after_skip, + num_output_layers=self.num_output_layers, + ) + + @conditional_grad(torch.enable_grad()) + def forces_forward(self, preds): + return + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + # Rewire the graph + z = data.atomic_numbers.long() + batch_data = deepcopy(data) + batch_data.z = z + + return self.spherenet.forward(batch_data) From 37333a83e3485663a4d55a9938747e53147b0703 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:02:48 -0500 Subject: [PATCH 258/273] setup distributed in build config --- main.py | 4 ---- ocpmodels/common/dist_utils.py | 3 ++- ocpmodels/common/utils.py | 2 ++ 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index cb81c7fb61..3060ec09c2 100644 --- a/main.py +++ b/main.py @@ -94,10 +94,6 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): trainer_config = build_config(args, override_args) - if args.distributed: - dist_utils.setup(trainer_config) - print("Distributed backend setup.") - if dist_utils.is_master(): trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) dist_utils.synchronize() diff --git a/ocpmodels/common/dist_utils.py b/ocpmodels/common/dist_utils.py index 024b98e280..aab74a83ae 100644 --- a/ocpmodels/common/dist_utils.py +++ b/ocpmodels/common/dist_utils.py @@ -14,7 +14,8 @@ def setup(config): - assert config["distributed"] + if not config["distributed"]: + return node_list = os.environ.get("SLURM_STEP_NODELIST") if node_list is None: node_list = os.environ.get("SLURM_JOB_NODELIST") diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index f2e09c762a..bf17f5cb6c 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -38,6 +38,7 @@ import ocpmodels from ocpmodels.common.flags import flags from ocpmodels.common.registry import registry +import ocpmodels.common.dist_utils as dist_utils class Cluster: @@ -1090,6 +1091,7 @@ def build_config(args, args_override, silent=False): config = continue_from_slurm_job_id(config) config = read_slurm_env(config) config["optim"]["eval_batch_size"] = config["optim"]["batch_size"] + dist_utils.setup(config) return config From b1f51261e9ec464ac17d2050c7182124f58b2490 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:03:09 -0500 Subject: [PATCH 259/273] handle bool no arg in `create_dict_from_args` --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index bf17f5cb6c..a7b6231e60 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py 
@@ -807,7 +807,7 @@ def create_dict_from_args(args: list, sep: str = "."): return_dict = {} for arg in args: arg = arg.strip("--") - keys_concat, val = arg.split("=") + keys_concat, val = arg.split("=") if "=" in arg else (arg, "True") val = parse_value(val) key_sequence = keys_concat.split(sep) dict_set_recursively(return_dict, key_sequence, val) From 760c220a18be00a8d858b090f59b2f95f2262d5e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:03:23 -0500 Subject: [PATCH 260/273] clean up continued congif from previous timestamp, comit, distributed port etc. --- ocpmodels/common/utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index a7b6231e60..9e59856e54 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1041,6 +1041,18 @@ def build_config(args, args_override, silent=False): config["world_size"] = args.num_nodes * args.num_gpus if continue_config: + continue_config.pop("timestamp_id", None) + continue_config.pop("commit", None) + continue_config.pop("early_stopping_file", None) + continue_config.pop("timestamp_id", None) + continue_config.pop("distributed_port", None) + continue_config.pop("continue_from_dir", None) + continue_config.pop("restart_from_dir", None) + + continue_config["run_dir"] = resolve(continue_config["run_dir"]) + continue_config["job_id"] = JOB_ID + continue_config["local_rank"] = config["local_rank"] + new_dirs = [ (k, v) for k, v in config.items() if "dir" in k and k != "cp_data_to_tmpdir" ] From 7ce80b0ad700b3b6747cdbf8486e5b2d8a8f7532 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:03:40 -0500 Subject: [PATCH 261/273] also resolve Paths in new_dirs when building conf --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 9e59856e54..fd4e879512 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1067,7 +1067,7 @@ def build_config(args, args_override, silent=False): ) config = merge_dicts( continue_config, - {k: resolve(v) if isinstance(v, str) else v for k, v in new_dirs}, + {k: resolve(v) if isinstance(v, (str, Path)) else v for k, v in new_dirs}, ) config["dataset"] = merge_dicts(config["dataset"], data_srcs) cli = cli_args_dict() From 785010355603269baf6c37e359677df6ff26bc08 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:03:57 -0500 Subject: [PATCH 262/273] handle list of dicts in merge_dicts --- ocpmodels/common/utils.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index fd4e879512..e093bd5850 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1459,9 +1459,9 @@ def merge_dicts(dict1: dict, dict2: dict) -> dict: Merged dictionaries. """ if not isinstance(dict1, dict): - raise ValueError(f"Expecting dict1 to be dict, found {type(dict1)}.") + raise ValueError(f"Expecting dict1 to be dict, found {type(dict1)} {dict1}.") if not isinstance(dict2, dict): - raise ValueError(f"Expecting dict2 to be dict, found {type(dict2)}.") + raise ValueError(f"Expecting dict2 to be dict, found {type(dict2)} {dict2}.") return_dict = copy.deepcopy(dict1) @@ -1477,7 +1477,21 @@ def merge_dicts(dict1: dict, dict2: dict) -> dict: f"List for key {k} has different length in dict1 and dict2." + " Use an empty dict {} to pad for items in the shorter list." 
) - return_dict[k] = [merge_dicts(d1, d2) for d1, d2 in zip(dict1[k], v)] + if isinstance(dict1[k][0], dict): + if not isinstance(dict2[k][0], dict): + raise ValueError( + f"Expecting dict for key {k} in dict2. ({dict1}, {dict2})" + ) + return_dict[k] = [ + merge_dicts(d1, d2) for d1, d2 in zip(dict1[k], v) + ] + else: + if isinstance(dict2[k][0], dict): + raise ValueError( + f"Expecting dict for key {k} in dict1. ({dict1}, {dict2})" + ) + return_dict[k] = v + else: return_dict[k] = dict2[k] From 039e2dcb6c3ba92cf3f8faba88e4e093f29af849 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:04:10 -0500 Subject: [PATCH 263/273] fix cuda erro in `segment_coo`, use `unique` --- ocpmodels/common/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index e093bd5850..0603b338be 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1537,7 +1537,9 @@ def compute_neighbors(data, edge_index): # Get number of neighbors # segment_coo assumes sorted index ones = edge_index[1].new_ones(1).expand_as(edge_index[1]) - num_neighbors = segment_coo(ones, edge_index[1], dim_size=data.natoms.sum()) + # CUDA error, changing (victor 2023-01-25) + # num_neighbors = segment_coo(ones, edge_index[1], dim_size=data.natoms.sum()) + _, num_neighbors = torch.unique(edge_index[1], return_counts=True) # Get number of neighbors per image image_indptr = torch.zeros( From a0dd92fecadc6234e6759a5665f1f9669e341d65 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:06:12 -0500 Subject: [PATCH 264/273] Add PAINN model --- configs/models/painn.yaml | 70 ++ ocpmodels/common/scaling/__init__.py | 3 + ocpmodels/common/scaling/compat.py | 76 ++ ocpmodels/common/scaling/fit.py | 241 +++++ ocpmodels/common/scaling/scale_factor.py | 170 ++++ ocpmodels/common/scaling/util.py | 23 + ocpmodels/models/painn.py | 879 ++++++++++++++++++ ocpmodels/models/painn_nb6_scaling_factors.pt | Bin 0 -> 2199 bytes 8 files changed, 1462 insertions(+) create mode 100644 configs/models/painn.yaml create mode 100644 ocpmodels/common/scaling/__init__.py create mode 100644 ocpmodels/common/scaling/compat.py create mode 100644 ocpmodels/common/scaling/fit.py create mode 100644 ocpmodels/common/scaling/scale_factor.py create mode 100644 ocpmodels/common/scaling/util.py create mode 100644 ocpmodels/models/painn.py create mode 100644 ocpmodels/models/painn_nb6_scaling_factors.pt diff --git a/configs/models/painn.yaml b/configs/models/painn.yaml new file mode 100644 index 0000000000..2c0abac112 --- /dev/null +++ b/configs/models/painn.yaml @@ -0,0 +1,70 @@ +default: + model: + name: painn + use_pbc: True + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + 10k: {} + 100k: {} + all: {} + +# ------------------ +# ----- S2EF ----- +# ------------------ + +s2ef: + default: {} + 200k: {} + 2M: {} + 20M: {} + all: {} + +qm9: + default: + model: + num_atoms: null # useless + bond_feat_dim: null # useless + num_targets: null # useless + hidden_channels: 512 + num_layers: 6 + num_rbf: 128 + cutoff: 12.0 + max_neighbors: 50 + rbf: {"name": "gaussian"} + envelope: {"name": "polynomial", "exponent": 5} + regress_forces: False + direct_forces: True + use_pbc: False + otf_graph: False + num_elements: 83 + optim: + batch_size: 100 + num_workers: 4 + lr_initial: 0.001 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: 
min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 + + 10k: {} + all: {} + +qm7x: + default: {} + all: {} + 1k: {} + + diff --git a/ocpmodels/common/scaling/__init__.py b/ocpmodels/common/scaling/__init__.py new file mode 100644 index 0000000000..807416b066 --- /dev/null +++ b/ocpmodels/common/scaling/__init__.py @@ -0,0 +1,3 @@ +from .scale_factor import ScaleFactor + +__all__ = ["ScaleFactor"] diff --git a/ocpmodels/common/scaling/compat.py b/ocpmodels/common/scaling/compat.py new file mode 100644 index 0000000000..4240db0556 --- /dev/null +++ b/ocpmodels/common/scaling/compat.py @@ -0,0 +1,76 @@ +import json +import logging +from pathlib import Path +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn + +from .scale_factor import ScaleFactor + +ScaleDict = Union[Dict[str, float], Dict[str, torch.Tensor]] + + +def _load_scale_dict(scale_file: Optional[Union[str, ScaleDict]]): + """ + Loads scale factors from either: + - a JSON file mapping scale factor names to scale values + - a python dictionary pickled object (loaded using `torch.load`) mapping scale factor names to scale values + - a dictionary mapping scale factor names to scale values + """ + if not scale_file: + return None + + if isinstance(scale_file, dict): + if not scale_file: + logging.warning("Empty scale dictionary provided to model.") + return scale_file + + path = Path(scale_file) + if not path.exists(): + raise ValueError(f"Scale file {path} does not exist.") + + scale_dict: Optional[ScaleDict] = None + if path.suffix == ".pt": + scale_dict = torch.load(path) + elif path.suffix == ".json": + with open(path, "r") as f: + scale_dict = json.load(f) + + if isinstance(scale_dict, dict): + # old json scale factors have a comment field that has the model name + scale_dict.pop("comment", None) + else: + raise ValueError(f"Unsupported scale file extension: {path.suffix}") + + if not scale_dict: + return None + + return scale_dict + + +def load_scales_compat( + module: nn.Module, scale_file: Optional[Union[str, ScaleDict]] +): + scale_dict = _load_scale_dict(scale_file) + if not scale_dict: + return + + scale_factors = { + module.name or name: (module, name) + for name, module in module.named_modules() + if isinstance(module, ScaleFactor) + } + logging.debug( + f"Found the following scale factors: {[(k, name) for k, (_, name) in scale_factors.items()]}" + ) + for name, scale in scale_dict.items(): + if name not in scale_factors: + logging.warning(f"Scale factor {name} not found in model") + continue + + scale_module, module_name = scale_factors[name] + logging.debug( + f"Loading scale factor {scale} for ({name} => {module_name})" + ) + scale_module.set_(scale) diff --git a/ocpmodels/common/scaling/fit.py b/ocpmodels/common/scaling/fit.py new file mode 100644 index 0000000000..83f1f72c7d --- /dev/null +++ b/ocpmodels/common/scaling/fit.py @@ -0,0 +1,241 @@ +import logging +import math +import readline +import sys +from itertools import islice +from pathlib import Path +from typing import TYPE_CHECKING, Dict, Literal + +import torch +import torch.nn as nn +from torch.nn.parallel.distributed import DistributedDataParallel + +from ocpmodels.common.data_parallel import OCPDataParallel +from ocpmodels.common.flags import flags +from ocpmodels.common.utils import ( + build_config, + new_trainer_context, + setup_logging, +) +from ocpmodels.modules.scaling import ScaleFactor +from ocpmodels.modules.scaling.compat import load_scales_compat + +if 
TYPE_CHECKING: + from ocpmodels.trainers.base_trainer import BaseTrainer + + +def _prefilled_input(prompt: str, prefill: str = ""): + readline.set_startup_hook(lambda: readline.insert_text(prefill)) + try: + return input(prompt) + finally: + readline.set_startup_hook() + + +def _train_batch(trainer: "BaseTrainer", batch): + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=trainer.scaler is not None): + out = trainer._forward(batch) + loss = trainer._compute_loss(out, batch) + del out, loss + + +def main(*, num_batches: int = 16): + # region args/config setup + setup_logging() + + parser = flags.get_parser() + args, override_args = parser.parse_known_args() + _config = build_config(args, override_args) + _config["logger"] = "tensorboard" + # endregion + + assert not args.distributed, "This doesn't work with DDP" + with new_trainer_context(args=args, config=_config) as ctx: + config = ctx.config + trainer = ctx.trainer + + ckpt_file = config.get("checkpoint", None) + assert ( + ckpt_file is not None + ), "Checkpoint file not specified. Please specify --checkpoint " + ckpt_file = Path(ckpt_file) + + logging.info( + f"Input checkpoint path: {ckpt_file}, {ckpt_file.exists()=}" + ) + + model: nn.Module = trainer.model + val_loader = trainer.val_loader + assert ( + val_loader is not None + ), "Val dataset is required for making predictions" + + if ckpt_file.exists(): + trainer.load_checkpoint(str(ckpt_file)) + + # region reoad scale file contents if necessary + # unwrap module from DP/DDP + unwrapped_model = model + while isinstance( + unwrapped_model, (DistributedDataParallel, OCPDataParallel) + ): + unwrapped_model = unwrapped_model.module + assert isinstance( + unwrapped_model, nn.Module + ), "Model is not a nn.Module" + load_scales_compat(unwrapped_model, config.get("scale_file", None)) + # endregion + + model.eval() + + # recursively go through the submodules and get the ScaleFactor modules + scale_factors: Dict[str, ScaleFactor] = { + name: module + for name, module in model.named_modules() + if isinstance(module, ScaleFactor) + } + + mode: Literal["all", "unfitted"] = "all" + + # region detect fitted/unfitted factors + fitted_scale_factors = [ + f"{name}: {module.scale_factor.item():.3f}" + for name, module in scale_factors.items() + if module.fitted + ] + unfitted_scale_factors = [ + name for name, module in scale_factors.items() if not module.fitted + ] + fitted_scale_factors_str = ", ".join(fitted_scale_factors) + logging.info(f"Fitted scale factors: [{fitted_scale_factors_str}]") + unfitted_scale_factors_str = ", ".join(unfitted_scale_factors) + logging.info(f"Unfitted scale factors: [{unfitted_scale_factors_str}]") + + if fitted_scale_factors: + flag = input( + "Do you want to continue and fit all scale factors (1), " + "only fit the variables not fitted yet (2), or exit (3)? " + ) + if str(flag) == "1": + mode = "all" + logging.info("Fitting all scale factors.") + elif str(flag) == "2": + mode = "unfitted" + logging.info("Only fitting unfitted variables.") + else: + print(flag) + logging.info("Exiting script") + sys.exit() + # endregion + + # region get the output path + out_path = Path( + _prefilled_input( + "Enter output path for fitted scale factors: ", + prefill=str(ckpt_file), + ) + ) + if out_path.exists(): + logging.warning(f"Already found existing file: {out_path}") + flag = input( + "Do you want to continue and overwrite existing file (1), " + "or exit (2)? 
" + ) + if str(flag) == "1": + logging.info("Overwriting existing file.") + else: + logging.info("Exiting script") + sys.exit() + + logging.info( + f"Output path for fitted scale factors: {out_path}, {out_path.exists()=}" + ) + # endregion + + # region reset the scale factors if mode == "all" + if mode == "all": + logging.info("Fitting all scale factors.") + for name, scale_factor in scale_factors.items(): + if scale_factor.fitted: + logging.info( + f"{name} is already fitted in the checkpoint, resetting it. {scale_factor.scale_factor}" + ) + scale_factor.reset_() + # endregion + + # region we do a single pass through the network to get the correct execution order of the scale factors + scale_factor_indices: Dict[str, int] = {} + max_idx = 0 + + # initialize all scale factors + for name, module in scale_factors.items(): + + def index_fn(name=name): + nonlocal max_idx + assert name is not None + if name not in scale_factor_indices: + scale_factor_indices[name] = max_idx + logging.debug(f"Scale factor for {name} = {max_idx}") + max_idx += 1 + + module.initialize_(index_fn=index_fn) + + # single pass through network + _train_batch(trainer, next(iter(val_loader))) + + # sort the scale factors by their computation order + sorted_factors = sorted( + scale_factors.items(), + key=lambda x: scale_factor_indices.get(x[0], math.inf), + ) + + logging.info("Sorted scale factors by computation order:") + for name, _ in sorted_factors: + logging.info(f"{name}: {scale_factor_indices[name]}") + + # endregion + + # loop over the scale factors in the computation order + # and fit them one by one + logging.info("Start fitting") + + for name, module in sorted_factors: + if mode == "unfitted" and module.fitted: + logging.info(f"Skipping {name} (already fitted)") + continue + + logging.info(f"Fitting {name}...") + with module.fit_context_(): + for batch in islice(val_loader, num_batches): + _train_batch(trainer, batch) + stats, ratio, value = module.fit_() + + logging.info( + f"Variable: {name}, " + f"Var_in: {stats['variance_in']:.3f}, " + f"Var_out: {stats['variance_out']:.3f}, " + f"Ratio: {ratio:.3f} => Scaling factor: {value:.3f}" + ) + + # make sure all scale factors are fitted + for name, module in sorted_factors: + assert module.fitted, f"{name} is not fitted" + + # region save the scale factors to the checkpoint file + trainer.config["cmd"]["checkpoint_dir"] = out_path.parent + trainer.is_debug = False + out_file = trainer.save( + metrics=None, + checkpoint_file=out_path.name, + training_state=False, + ) + assert out_file is not None, "Failed to save checkpoint" + out_file = Path(out_file) + assert out_file.exists(), f"Failed to save checkpoint to {out_file}" + # endregion + logging.info(f"Saved results to: {out_file}") + + +if __name__ == "__main__": + main() diff --git a/ocpmodels/common/scaling/scale_factor.py b/ocpmodels/common/scaling/scale_factor.py new file mode 100644 index 0000000000..8a8d5a55a5 --- /dev/null +++ b/ocpmodels/common/scaling/scale_factor.py @@ -0,0 +1,170 @@ +import itertools +import logging +import math +from contextlib import contextmanager +from typing import Callable, Optional, TypedDict, Union + +import torch +import torch.nn as nn + + +class _Stats(TypedDict): + variance_in: float + variance_out: float + n_samples: int + + +IndexFn = Callable[[], None] + + +def _check_consistency(old: torch.Tensor, new: torch.Tensor, key: str): + if not torch.allclose(old, new): + raise ValueError( + f"Scale factor parameter {key} is inconsistent with the loaded state dict.\n" + f"Old: 
{old}\n" + f"Actual: {new}" + ) + + +class ScaleFactor(nn.Module): + scale_factor: torch.Tensor + + name: Optional[str] = None + index_fn: Optional[IndexFn] = None + stats: Optional[_Stats] = None + + def __init__( + self, + name: Optional[str] = None, + enforce_consistency: bool = True, + ): + super().__init__() + + self.name = name + self.index_fn = None + self.stats = None + + self.scale_factor = nn.parameter.Parameter( + torch.tensor(0.0), requires_grad=False + ) + if enforce_consistency: + self._register_load_state_dict_pre_hook(self._enforce_consistency) + + def _enforce_consistency( + self, + state_dict, + prefix, + _local_metadata, + _strict, + _missing_keys, + _unexpected_keys, + _error_msgs, + ): + if not self.fitted: + return + + persistent_buffers = { + k: v + for k, v in self._buffers.items() + if k not in self._non_persistent_buffers_set + } + local_name_params = itertools.chain( + self._parameters.items(), persistent_buffers.items() + ) + local_state = {k: v for k, v in local_name_params if v is not None} + + for name, param in local_state.items(): + key = prefix + name + if key not in state_dict: + continue + + input_param = state_dict[key] + _check_consistency(old=param, new=input_param, key=key) + + @property + def fitted(self): + return bool((self.scale_factor != 0.0).item()) + + @torch.jit.unused + def reset_(self): + self.scale_factor.zero_() + + @torch.jit.unused + def set_(self, scale: Union[float, torch.Tensor]): + if self.fitted: + _check_consistency( + old=self.scale_factor, + new=torch.tensor(scale) if isinstance(scale, float) else scale, + key="scale_factor", + ) + self.scale_factor.fill_(scale) + + @torch.jit.unused + def initialize_(self, *, index_fn: Optional[IndexFn] = None): + self.index_fn = index_fn + + @contextmanager + @torch.jit.unused + def fit_context_(self): + self.stats = _Stats(variance_in=0.0, variance_out=0.0, n_samples=0) + yield + del self.stats + self.stats = None + + @torch.jit.unused + def fit_(self): + assert self.stats, "Stats not set" + for k, v in self.stats.items(): + assert v > 0, f"{k} is {v}" + + self.stats["variance_in"] = ( + self.stats["variance_in"] / self.stats["n_samples"] + ) + self.stats["variance_out"] = ( + self.stats["variance_out"] / self.stats["n_samples"] + ) + + ratio = self.stats["variance_out"] / self.stats["variance_in"] + value = math.sqrt(1 / ratio) + + self.set_(value) + + stats = dict(**self.stats) + return stats, ratio, value + + @torch.no_grad() + @torch.jit.unused + def _observe(self, x: torch.Tensor, ref: Optional[torch.Tensor] = None): + if self.stats is None: + logging.debug("Observer not initialized but self.observe() called") + return + + n_samples = x.shape[0] + self.stats["variance_out"] += ( + torch.mean(torch.var(x, dim=0)).item() * n_samples + ) + + if ref is None: + self.stats["variance_in"] += n_samples + else: + self.stats["variance_in"] += ( + torch.mean(torch.var(ref, dim=0)).item() * n_samples + ) + self.stats["n_samples"] += n_samples + + def forward( + self, + x: torch.Tensor, + *, + ref: Optional[torch.Tensor] = None, + ): + if self.index_fn is not None: + self.index_fn() + + if self.fitted: + x = x * self.scale_factor + + if not torch.jit.is_scripting(): + self._observe(x, ref=ref) + + return x diff --git a/ocpmodels/common/scaling/util.py b/ocpmodels/common/scaling/util.py new file mode 100644 index 0000000000..15c58b5d42 --- /dev/null +++ b/ocpmodels/common/scaling/util.py @@ -0,0 +1,23 @@ +import logging + +import torch.nn as nn + +from .scale_factor import ScaleFactor + + +def 
ensure_fitted(module: nn.Module, warn: bool = False): + for name, child in module.named_modules(): + if not isinstance(child, ScaleFactor) or child.fitted: + continue + if child.name is not None: + name = f"{child.name} ({name})" + msg = ( + f"Scale factor {name} is not fitted. " + "Please make sure that you either (1) load a checkpoint with fitted scale factors, " + "(2) explicitly load scale factors using the `model.scale_file` attribute, or " + "(3) fit the scale factors using the `fit.py` script." + ) + if warn: + logging.warning(msg) + else: + raise ValueError(msg) diff --git a/ocpmodels/models/painn.py b/ocpmodels/models/painn.py new file mode 100644 index 0000000000..8b2f5d45c3 --- /dev/null +++ b/ocpmodels/models/painn.py @@ -0,0 +1,879 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. + +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. + +--- + +MIT License + +Copyright (c) 2021 www.compscience.org + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +import logging +import math +import os +from typing import Optional, Tuple + +import torch +from torch import nn +from torch_geometric.nn import MessagePassing, radius_graph +from torch_scatter import scatter, segment_coo, segment_csr + +from ocpmodels.common.registry import registry +from ocpmodels.common.utils import ( + compute_neighbors, + conditional_grad, + get_pbc_distances, + radius_graph_pbc, + ROOT, +) +import numpy as np +from ocpmodels.models.base_model import BaseModel +from ocpmodels.models.gemnet.layers.base_layers import ScaledSiLU + +# from ocpmodels.models.gemnet.layers.embedding_block import AtomEmbedding # updated version copied here +from ocpmodels.models.gemnet.layers.radial_basis import RadialBasis +from ocpmodels.common.scaling import ScaleFactor +from ocpmodels.common.scaling.compat import load_scales_compat + +# from .utils import get_edge_id, repeat_blocks copied here + + +class AtomEmbedding(torch.nn.Module): + """ + Initial atom embeddings based on the atom type + + Parameters + ---------- + emb_size: int + Atom embeddings size + """ + + def __init__(self, emb_size, num_elements): + super().__init__() + self.emb_size = emb_size + + self.embeddings = torch.nn.Embedding(num_elements, emb_size) + # init by uniform distribution + torch.nn.init.uniform_(self.embeddings.weight, a=-np.sqrt(3), b=np.sqrt(3)) + + def forward(self, Z): + """ + Returns + ------- + h: torch.Tensor, shape=(nAtoms, emb_size) + Atom embeddings. 
+ """ + h = self.embeddings(Z - 1) # -1 because Z.min()=1 (==Hydrogen) + return h + + +def repeat_blocks( + sizes, + repeats, + continuous_indexing=True, + start_idx=0, + block_inc=0, + repeat_inc=0, +): + """Repeat blocks of indices. + Adapted from https://stackoverflow.com/questions/51154989/numpy-vectorized-function-to-repeat-blocks-of-consecutive-elements + + continuous_indexing: Whether to keep increasing the index after each block + start_idx: Starting index + block_inc: Number to increment by after each block, + either global or per block. Shape: len(sizes) - 1 + repeat_inc: Number to increment by after each repetition, + either global or per block + + Examples + -------- + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = False + Return: [0 0 0 0 1 2 0 1 2 0 1 0 1 0 1] + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = True + Return: [0 0 0 1 2 3 1 2 3 4 5 4 5 4 5] + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = True ; + repeat_inc = 4 + Return: [0 4 8 1 2 3 5 6 7 4 5 8 9 12 13] + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = True ; + start_idx = 5 + Return: [5 5 5 6 7 8 6 7 8 9 10 9 10 9 10] + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = True ; + block_inc = 1 + Return: [0 0 0 2 3 4 2 3 4 6 7 6 7 6 7] + sizes = [0,3,2] ; repeats = [3,2,3] ; continuous_indexing = True + Return: [0 1 2 0 1 2 3 4 3 4 3 4] + sizes = [2,3,2] ; repeats = [2,0,2] ; continuous_indexing = True + Return: [0 1 0 1 5 6 5 6] + """ + assert sizes.dim() == 1 + assert all(sizes >= 0) + + # Remove 0 sizes + sizes_nonzero = sizes > 0 + if not torch.all(sizes_nonzero): + assert block_inc == 0 # Implementing this is not worth the effort + sizes = torch.masked_select(sizes, sizes_nonzero) + if isinstance(repeats, torch.Tensor): + repeats = torch.masked_select(repeats, sizes_nonzero) + if isinstance(repeat_inc, torch.Tensor): + repeat_inc = torch.masked_select(repeat_inc, sizes_nonzero) + + if isinstance(repeats, torch.Tensor): + assert all(repeats >= 0) + insert_dummy = repeats[0] == 0 + if insert_dummy: + one = sizes.new_ones(1) + zero = sizes.new_zeros(1) + sizes = torch.cat((one, sizes)) + repeats = torch.cat((one, repeats)) + if isinstance(block_inc, torch.Tensor): + block_inc = torch.cat((zero, block_inc)) + if isinstance(repeat_inc, torch.Tensor): + repeat_inc = torch.cat((zero, repeat_inc)) + else: + assert repeats >= 0 + insert_dummy = False + + # Get repeats for each group using group lengths/sizes + r1 = torch.repeat_interleave(torch.arange(len(sizes), device=sizes.device), repeats) + + # Get total size of output array, as needed to initialize output indexing array + N = (sizes * repeats).sum() + + # Initialize indexing array with ones as we need to setup incremental indexing + # within each group when cumulatively summed at the final stage. + # Two steps here: + # 1. Within each group, we have multiple sequences, so setup the offsetting + # at each sequence lengths by the seq. lengths preceding those. 
+ id_ar = torch.ones(N, dtype=torch.long, device=sizes.device) + id_ar[0] = 0 + insert_index = sizes[r1[:-1]].cumsum(0) + insert_val = (1 - sizes)[r1[:-1]] + + if isinstance(repeats, torch.Tensor) and torch.any(repeats == 0): + diffs = r1[1:] - r1[:-1] + indptr = torch.cat((sizes.new_zeros(1), diffs.cumsum(0))) + if continuous_indexing: + # If a group was skipped (repeats=0) we need to add its size + insert_val += segment_csr(sizes[: r1[-1]], indptr, reduce="sum") + + # Add block increments + if isinstance(block_inc, torch.Tensor): + insert_val += segment_csr(block_inc[: r1[-1]], indptr, reduce="sum") + else: + insert_val += block_inc * (indptr[1:] - indptr[:-1]) + if insert_dummy: + insert_val[0] -= block_inc + else: + idx = r1[1:] != r1[:-1] + if continuous_indexing: + # 2. For each group, make sure the indexing starts from the next group's + # first element. So, simply assign 1s there. + insert_val[idx] = 1 + + # Add block increments + insert_val[idx] += block_inc + + # Add repeat_inc within each group + if isinstance(repeat_inc, torch.Tensor): + insert_val += repeat_inc[r1[:-1]] + if isinstance(repeats, torch.Tensor): + repeat_inc_inner = repeat_inc[repeats > 0][:-1] + else: + repeat_inc_inner = repeat_inc[:-1] + else: + insert_val += repeat_inc + repeat_inc_inner = repeat_inc + + # Subtract the increments between groups + if isinstance(repeats, torch.Tensor): + repeats_inner = repeats[repeats > 0][:-1] + else: + repeats_inner = repeats + insert_val[r1[1:] != r1[:-1]] -= repeat_inc_inner * repeats_inner + + # Assign index-offsetting values + id_ar[insert_index] = insert_val + + if insert_dummy: + id_ar = id_ar[1:] + if continuous_indexing: + id_ar[0] -= 1 + + # Set start index now, in case of insertion due to leading repeats=0 + id_ar[0] += start_idx + + # Finally index into input array for the group repeated o/p + res = id_ar.cumsum(0) + return res + + +def get_edge_id(edge_idx, cell_offsets, num_atoms): + cell_basis = cell_offsets.max() - cell_offsets.min() + 1 + cell_id = ( + (cell_offsets * cell_offsets.new_tensor([[1, cell_basis, cell_basis**2]])) + .sum(-1) + .long() + ) + edge_id = edge_idx[0] + edge_idx[1] * num_atoms + cell_id * num_atoms**2 + return edge_id + + +@registry.register_model("painn") +class PaiNN(BaseModel): + r"""PaiNN model based on the description in Schütt et al. (2021): + Equivariant message passing for the prediction of tensorial properties + and molecular spectra, https://arxiv.org/abs/2102.03150. + """ + + def __init__(self, **kwargs): + super(PaiNN, self).__init__() + self.num_atoms = kwargs.get("num_atoms") + self.bond_feat_dim = kwargs.get("bond_feat_dim") + self.num_targets = kwargs.get("num_targets") + self.hidden_channels = kwargs.get("hidden_channels", 512) + self.num_layers = kwargs.get("num_layers", 6) + self.num_rbf = kwargs.get("num_rbf", 128) + self.cutoff = kwargs.get("cutoff", 12.0) + self.max_neighbors = kwargs.get("max_neighbors", 50) + self.rbf = kwargs.get("rbf", {"name": "gaussian"}) + self.envelope = kwargs.get("envelope", {"name": "polynomial", "exponent": 5}) + self.regress_forces = kwargs.get("regress_forces", True) + self.direct_forces = kwargs.get("direct_forces", True) + self.use_pbc = kwargs.get("use_pbc", True) + self.otf_graph = kwargs.get("otf_graph", True) + self.num_elements = kwargs.get("num_elements", 83) + self.scale_file = ROOT / "ocpmodels" / "models" / "painn_nb6_scaling_factors.pt" + + # Borrowed from GemNet. 
+ self.symmetric_edge_symmetrization = False + + #### Learnable parameters ############################################# + + self.atom_emb = AtomEmbedding(self.hidden_channels, self.num_elements) + + self.radial_basis = RadialBasis( + num_radial=self.num_rbf, + cutoff=self.cutoff, + rbf=self.rbf, + envelope=self.envelope, + ) + + self.message_layers = nn.ModuleList() + self.update_layers = nn.ModuleList() + + for i in range(self.num_layers): + self.message_layers.append( + PaiNNMessage(self.hidden_channels, self.num_rbf).jittable() + ) + self.update_layers.append(PaiNNUpdate(self.hidden_channels)) + setattr(self, "upd_out_scalar_scale_%d" % i, ScaleFactor()) + + self.out_energy = nn.Sequential( + nn.Linear(self.hidden_channels, self.hidden_channels // 2), + ScaledSiLU(), + nn.Linear(self.hidden_channels // 2, 1), + ) + + if self.regress_forces is True and self.direct_forces is True: + self.out_forces = PaiNNOutput(self.hidden_channels) + + self.inv_sqrt_2 = 1 / math.sqrt(2.0) + + self.reset_parameters() + + load_scales_compat(self, self.scale_file) + + def reset_parameters(self): + nn.init.xavier_uniform_(self.out_energy[0].weight) + self.out_energy[0].bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_energy[2].weight) + self.out_energy[2].bias.data.fill_(0) + + # Borrowed from GemNet. + def select_symmetric_edges(self, tensor, mask, reorder_idx, inverse_neg): + # Mask out counter-edges + tensor_directed = tensor[mask] + # Concatenate counter-edges after normal edges + sign = 1 - 2 * inverse_neg + tensor_cat = torch.cat([tensor_directed, sign * tensor_directed]) + # Reorder everything so the edges of every image are consecutive + tensor_ordered = tensor_cat[reorder_idx] + return tensor_ordered + + # Borrowed from GemNet. + def symmetrize_edges( + self, + edge_index, + cell_offsets, + neighbors, + batch_idx, + reorder_tensors, + reorder_tensors_invneg, + ): + """ + Symmetrize edges to ensure existence of counter-directional edges. + + Some edges are only present in one direction in the data, + since every atom has a maximum number of neighbors. + If `symmetric_edge_symmetrization` is False, + we only use i->j edges here. So we lose some j->i edges + and add others by making it symmetric. + If `symmetric_edge_symmetrization` is True, + we always use both directions. 
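+
+        Returns the symmetrized edge_index, cell_offsets and per-image neighbor
+        counts, the reordered tensors (the second list with sign flipped on
+        counter-edges), and id_swap, an index array mapping every edge to its
+        counter-directional edge.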
+ """ + num_atoms = batch_idx.shape[0] + + if self.symmetric_edge_symmetrization: + edge_index_bothdir = torch.cat( + [edge_index, edge_index.flip(0)], + dim=1, + ) + cell_offsets_bothdir = torch.cat( + [cell_offsets, -cell_offsets], + dim=0, + ) + + # Filter for unique edges + edge_ids = get_edge_id(edge_index_bothdir, cell_offsets_bothdir, num_atoms) + unique_ids, unique_inv = torch.unique(edge_ids, return_inverse=True) + perm = torch.arange( + unique_inv.size(0), + dtype=unique_inv.dtype, + device=unique_inv.device, + ) + unique_idx = scatter( + perm, + unique_inv, + dim=0, + dim_size=unique_ids.shape[0], + reduce="min", + ) + edge_index_new = edge_index_bothdir[:, unique_idx] + + # Order by target index + edge_index_order = torch.argsort(edge_index_new[1]) + edge_index_new = edge_index_new[:, edge_index_order] + unique_idx = unique_idx[edge_index_order] + + # Subindex remaining tensors + cell_offsets_new = cell_offsets_bothdir[unique_idx] + reorder_tensors = [ + self.symmetrize_tensor(tensor, unique_idx, False) + for tensor in reorder_tensors + ] + reorder_tensors_invneg = [ + self.symmetrize_tensor(tensor, unique_idx, True) + for tensor in reorder_tensors_invneg + ] + + # Count edges per image + # segment_coo assumes sorted edge_index_new[1] and batch_idx + ones = edge_index_new.new_ones(1).expand_as(edge_index_new[1]) + neighbors_per_atom = segment_coo( + ones, edge_index_new[1], dim_size=num_atoms + ) + neighbors_per_image = segment_coo( + neighbors_per_atom, batch_idx, dim_size=neighbors.shape[0] + ) + else: + # Generate mask + mask_sep_atoms = edge_index[0] < edge_index[1] + # Distinguish edges between the same (periodic) atom by ordering the cells + cell_earlier = ( + (cell_offsets[:, 0] < 0) + | ((cell_offsets[:, 0] == 0) & (cell_offsets[:, 1] < 0)) + | ( + (cell_offsets[:, 0] == 0) + & (cell_offsets[:, 1] == 0) + & (cell_offsets[:, 2] < 0) + ) + ) + mask_same_atoms = edge_index[0] == edge_index[1] + mask_same_atoms &= cell_earlier + mask = mask_sep_atoms | mask_same_atoms + + # Mask out counter-edges + edge_index_new = edge_index[mask[None, :].expand(2, -1)].view(2, -1) + + # Concatenate counter-edges after normal edges + edge_index_cat = torch.cat( + [edge_index_new, edge_index_new.flip(0)], + dim=1, + ) + + # Count remaining edges per image + batch_edge = torch.repeat_interleave( + torch.arange(neighbors.size(0), device=edge_index.device), + neighbors, + ) + batch_edge = batch_edge[mask] + # segment_coo assumes sorted batch_edge + # Factor 2 since this is only one half of the edges + ones = batch_edge.new_ones(1).expand_as(batch_edge) + neighbors_per_image = 2 * segment_coo( + ones, batch_edge, dim_size=neighbors.size(0) + ) + + # Create indexing array + edge_reorder_idx = repeat_blocks( + torch.div(neighbors_per_image, 2, rounding_mode="floor"), + repeats=2, + continuous_indexing=True, + repeat_inc=edge_index_new.size(1), + ) + + # Reorder everything so the edges of every image are consecutive + edge_index_new = edge_index_cat[:, edge_reorder_idx] + cell_offsets_new = self.select_symmetric_edges( + cell_offsets, mask, edge_reorder_idx, True + ) + reorder_tensors = [ + self.select_symmetric_edges(tensor, mask, edge_reorder_idx, False) + for tensor in reorder_tensors + ] + reorder_tensors_invneg = [ + self.select_symmetric_edges(tensor, mask, edge_reorder_idx, True) + for tensor in reorder_tensors_invneg + ] + + # Indices for swapping c->a and a->c (for symmetric MP) + # To obtain these efficiently and without any index assumptions, + # we get order the counter-edge IDs and 
then + # map this order back to the edge IDs. + # Double argsort gives the desired mapping + # from the ordered tensor to the original tensor. + edge_ids = get_edge_id(edge_index_new, cell_offsets_new, num_atoms) + order_edge_ids = torch.argsort(edge_ids) + inv_order_edge_ids = torch.argsort(order_edge_ids) + edge_ids_counter = get_edge_id( + edge_index_new.flip(0), -cell_offsets_new, num_atoms + ) + order_edge_ids_counter = torch.argsort(edge_ids_counter) + id_swap = order_edge_ids_counter[inv_order_edge_ids] + + return ( + edge_index_new, + cell_offsets_new, + neighbors_per_image, + reorder_tensors, + reorder_tensors_invneg, + id_swap, + ) + + def generate_graph( + self, + data, + cutoff=None, + max_neighbors=None, + use_pbc=None, + otf_graph=None, + ): + cutoff = cutoff or self.cutoff + max_neighbors = max_neighbors or self.max_neighbors + use_pbc = use_pbc or self.use_pbc + otf_graph = otf_graph or self.otf_graph + + if not otf_graph: + try: + edge_index = data.edge_index + + if use_pbc: + cell_offsets = data.cell_offsets + neighbors = data.neighbors + + except AttributeError: + logging.warning( + "Turning otf_graph=True as required attributes not present in data object" + ) + otf_graph = True + + if use_pbc: + if otf_graph: + edge_index, cell_offsets, neighbors = radius_graph_pbc( + data, cutoff, max_neighbors + ) + + out = get_pbc_distances( + data.pos, + edge_index, + data.cell, + cell_offsets, + neighbors, + return_offsets=True, + return_distance_vec=True, + ) + + edge_index = out["edge_index"] + edge_dist = out["distances"] + cell_offset_distances = out["offsets"] + distance_vec = out["distance_vec"] + else: + if otf_graph: + edge_index = radius_graph( + data.pos, + r=cutoff, + batch=data.batch, + max_num_neighbors=max_neighbors, + ) + + j, i = edge_index + distance_vec = data.pos[j] - data.pos[i] + + edge_dist = distance_vec.norm(dim=-1) + cell_offsets = torch.zeros(edge_index.shape[1], 3, device=data.pos.device) + cell_offset_distances = torch.zeros_like( + cell_offsets, device=data.pos.device + ) + neighbors = compute_neighbors(data, edge_index) + + return ( + edge_index, + edge_dist, + distance_vec, + cell_offsets, + cell_offset_distances, + neighbors, + ) + + def generate_graph_values(self, data): + ( + edge_index, + edge_dist, + distance_vec, + cell_offsets, + _, # cell offset distances + neighbors, + ) = self.generate_graph(data) + + # Unit vectors pointing from edge_index[1] to edge_index[0], + # i.e., edge_index[0] - edge_index[1] divided by the norm. 
+ # make sure that the distances are not close to zero before dividing + mask_zero = torch.isclose(edge_dist, torch.tensor(0.0), atol=1e-6) + edge_dist[mask_zero] = 1.0e-6 + edge_vector = distance_vec / edge_dist[:, None] + + empty_image = neighbors == 0 + if torch.any(empty_image): + raise ValueError( + f"An image has no neighbors: id={data.id[empty_image]}, " + f"sid={data.sid[empty_image]}, fid={data.fid[empty_image]}" + ) + + # Symmetrize edges for swapping in symmetric message passing + ( + edge_index, + cell_offsets, + neighbors, + [edge_dist], + [edge_vector], + id_swap, + ) = self.symmetrize_edges( + edge_index, + cell_offsets, + neighbors, + data.batch, + [edge_dist], + [edge_vector], + ) + + return ( + edge_index, + neighbors, + edge_dist, + edge_vector, + id_swap, + ) + + @conditional_grad(torch.enable_grad()) + def forces_forward(self, preds): + return + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + pos = data.pos + batch = data.batch + z = data.atomic_numbers.long() + + if self.regress_forces and not self.direct_forces: + pos = pos.requires_grad_(True) + + ( + edge_index, + neighbors, + edge_dist, + edge_vector, + id_swap, + ) = self.generate_graph_values(data) + + assert z.dim() == 1 and z.dtype == torch.long + + edge_rbf = self.radial_basis(edge_dist) # rbf * envelope + + x = self.atom_emb(z) + vec = torch.zeros(x.size(0), 3, x.size(1), device=x.device) + + #### Interaction blocks ############################################### + + for i in range(self.num_layers): + dx, dvec = self.message_layers[i](x, vec, edge_index, edge_rbf, edge_vector) + + x = x + dx + vec = vec + dvec + x = x * self.inv_sqrt_2 + + dx, dvec = self.update_layers[i](x, vec) + + x = x + dx + vec = vec + dvec + x = getattr(self, "upd_out_scalar_scale_%d" % i)(x) + + #### Output block ##################################################### + + per_atom_energy = self.out_energy(x).squeeze(1) + energy = scatter(per_atom_energy, batch, dim=0) + + if self.regress_forces: + if self.direct_forces: + forces = self.out_forces(x, vec) + return energy, forces + else: + forces = ( + -1 + * torch.autograd.grad( + x, + pos, + grad_outputs=torch.ones_like(x), + create_graph=True, + )[0] + ) + return energy, forces + else: + return {"energy": energy} + + @property + def num_params(self): + return sum(p.numel() for p in self.parameters()) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(" + f"hidden_channels={self.hidden_channels}, " + f"num_layers={self.num_layers}, " + f"num_rbf={self.num_rbf}, " + f"max_neighbors={self.max_neighbors}, " + f"cutoff={self.cutoff})" + ) + + +class PaiNNMessage(MessagePassing): + def __init__( + self, + hidden_channels, + num_rbf, + ): + super(PaiNNMessage, self).__init__(aggr="add", node_dim=0) + + self.hidden_channels = hidden_channels + + self.x_proj = nn.Sequential( + nn.Linear(hidden_channels, hidden_channels), + ScaledSiLU(), + nn.Linear(hidden_channels, hidden_channels * 3), + ) + self.rbf_proj = nn.Linear(num_rbf, hidden_channels * 3) + + self.inv_sqrt_3 = 1 / math.sqrt(3.0) + self.inv_sqrt_h = 1 / math.sqrt(hidden_channels) + self.x_layernorm = nn.LayerNorm(hidden_channels) + + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.x_proj[0].weight) + self.x_proj[0].bias.data.fill_(0) + nn.init.xavier_uniform_(self.x_proj[2].weight) + self.x_proj[2].bias.data.fill_(0) + nn.init.xavier_uniform_(self.rbf_proj.weight) + self.rbf_proj.bias.data.fill_(0) + self.x_layernorm.reset_parameters() + + def 
forward(self, x, vec, edge_index, edge_rbf, edge_vector): + xh = self.x_proj(self.x_layernorm(x)) + + # TODO(@abhshkdz): Nans out with AMP here during backprop. Debug / fix. + rbfh = self.rbf_proj(edge_rbf) + + # propagate_type: (xh: Tensor, vec: Tensor, rbfh_ij: Tensor, r_ij: Tensor) + dx, dvec = self.propagate( + edge_index, + xh=xh, + vec=vec, + rbfh_ij=rbfh, + r_ij=edge_vector, + size=None, + ) + + return dx, dvec + + def message(self, xh_j, vec_j, rbfh_ij, r_ij): + x, xh2, xh3 = torch.split(xh_j * rbfh_ij, self.hidden_channels, dim=-1) + xh2 = xh2 * self.inv_sqrt_3 + + vec = vec_j * xh2.unsqueeze(1) + xh3.unsqueeze(1) * r_ij.unsqueeze(2) + vec = vec * self.inv_sqrt_h + + return x, vec + + def aggregate( + self, + features: Tuple[torch.Tensor, torch.Tensor], + index: torch.Tensor, + ptr: Optional[torch.Tensor], + dim_size: Optional[int], + ) -> Tuple[torch.Tensor, torch.Tensor]: + x, vec = features + x = scatter(x, index, dim=self.node_dim, dim_size=dim_size) + vec = scatter(vec, index, dim=self.node_dim, dim_size=dim_size) + return x, vec + + def update( + self, inputs: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, torch.Tensor]: + return inputs + + +class PaiNNUpdate(nn.Module): + def __init__(self, hidden_channels): + super().__init__() + self.hidden_channels = hidden_channels + + self.vec_proj = nn.Linear(hidden_channels, hidden_channels * 2, bias=False) + self.xvec_proj = nn.Sequential( + nn.Linear(hidden_channels * 2, hidden_channels), + ScaledSiLU(), + nn.Linear(hidden_channels, hidden_channels * 3), + ) + + self.inv_sqrt_2 = 1 / math.sqrt(2.0) + self.inv_sqrt_h = 1 / math.sqrt(hidden_channels) + + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.vec_proj.weight) + nn.init.xavier_uniform_(self.xvec_proj[0].weight) + self.xvec_proj[0].bias.data.fill_(0) + nn.init.xavier_uniform_(self.xvec_proj[2].weight) + self.xvec_proj[2].bias.data.fill_(0) + + def forward(self, x, vec): + vec1, vec2 = torch.split(self.vec_proj(vec), self.hidden_channels, dim=-1) + vec_dot = (vec1 * vec2).sum(dim=1) * self.inv_sqrt_h + + # NOTE: Can't use torch.norm because the gradient is NaN for input = 0. + # Add an epsilon offset to make sure sqrt is always positive. + x_vec_h = self.xvec_proj( + torch.cat([x, torch.sqrt(torch.sum(vec2**2, dim=-2) + 1e-8)], dim=-1) + ) + xvec1, xvec2, xvec3 = torch.split(x_vec_h, self.hidden_channels, dim=-1) + + dx = xvec1 + xvec2 * vec_dot + dx = dx * self.inv_sqrt_2 + + dvec = xvec3.unsqueeze(1) * vec1 + + return dx, dvec + + +class PaiNNOutput(nn.Module): + def __init__(self, hidden_channels): + super().__init__() + self.hidden_channels = hidden_channels + + self.output_network = nn.ModuleList( + [ + GatedEquivariantBlock( + hidden_channels, + hidden_channels // 2, + ), + GatedEquivariantBlock(hidden_channels // 2, 1), + ] + ) + + self.reset_parameters() + + def reset_parameters(self): + for layer in self.output_network: + layer.reset_parameters() + + def forward(self, x, vec): + for layer in self.output_network: + x, vec = layer(x, vec) + return vec.squeeze() + + +# Borrowed from TorchMD-Net +class GatedEquivariantBlock(nn.Module): + """Gated Equivariant Block as defined in Schütt et al. 
(2021): + Equivariant message passing for the prediction of tensorial properties and molecular spectra + """ + + def __init__( + self, + hidden_channels, + out_channels, + ): + super(GatedEquivariantBlock, self).__init__() + self.out_channels = out_channels + + self.vec1_proj = nn.Linear(hidden_channels, hidden_channels, bias=False) + self.vec2_proj = nn.Linear(hidden_channels, out_channels, bias=False) + + self.update_net = nn.Sequential( + nn.Linear(hidden_channels * 2, hidden_channels), + ScaledSiLU(), + nn.Linear(hidden_channels, out_channels * 2), + ) + + self.act = ScaledSiLU() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.vec1_proj.weight) + nn.init.xavier_uniform_(self.vec2_proj.weight) + nn.init.xavier_uniform_(self.update_net[0].weight) + self.update_net[0].bias.data.fill_(0) + nn.init.xavier_uniform_(self.update_net[2].weight) + self.update_net[2].bias.data.fill_(0) + + def forward(self, x, v): + vec1 = torch.norm(self.vec1_proj(v), dim=-2) + vec2 = self.vec2_proj(v) + + x = torch.cat([x, vec1], dim=-1) + x, v = torch.split(self.update_net(x), self.out_channels, dim=-1) + v = v.unsqueeze(1) * vec2 + + x = self.act(x) + return x, v diff --git a/ocpmodels/models/painn_nb6_scaling_factors.pt b/ocpmodels/models/painn_nb6_scaling_factors.pt new file mode 100644 index 0000000000000000000000000000000000000000..3843d7c80738debaba50a0dc1318caa2041296c8 GIT binary patch literal 2199 zcmbuB&1(}u7>6fGo2IKKv1-+-t=~LkLA8=${cs zV@P)ltFd9G>N?Srt!H+*6h*7TcolpJC{Y`y@lLC*H9JIW8@jDK-ehWN2pFVEF-fgM z8g^UJ9P??XVb`^m?&!}=Vmb;0eM*SA-flYDMpl7PB2gQmk~W3uYbFe_h*GwjI(f`K zRA9Ikpfza-*FtnQS{)cs0|X+ER5UZQX&6o0HVx8fuD6wkPTh3O`u&DM6d0Yspb$8Q z;rOQIo}B1H6hqAaxiS#9f)>3iYsg~O44kw)+ZeNXoKpl&V>rX0<6St5;haA-3+E+h z7A6?<0tdZF;1Y&O2A%4{WeivRp*grJL3418L9cVr8w74*m}byK7wAj4UNKH1s~ zBIF(w-^PxjqR`l>bh)%3wYAXWN1g0`c)RSXzr-e{=t4s7ag_R=sLqs2OZ}?7+nP_9 zp8wa6&vqaF?4bTXU+bQKb#&`NAO75+{@2@v=l{6!Dc*-aKdArj3+8|KX7zU;{=%UC z?>oBp{I%(+?f?5@X!)@JHcY2Ye+kQ_;vfx}Q{3+V_I_=Jt~ul$6*b^ahkFXYDC6!0 zVaPj-oaWb6R7mbn3!hG<vpqSRm9u;=irFgtiUe;?&hmXE zW_xlrFK7845VO@`$y<=K{5KdA-gA%3?)OYZ4Top_6yw06?NFV8bJ}{*0b6@E_?cz literal 0 HcmV?d00001 From 7b1d7a68f7fba64993f99256a55783cbabd905a8 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:06:31 -0500 Subject: [PATCH 265/273] Add COMENET model --- configs/models/comenet.yaml | 62 +++++++++++++++++++++++++++++++++++++ ocpmodels/models/comenet.py | 42 +++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 configs/models/comenet.yaml create mode 100644 ocpmodels/models/comenet.py diff --git a/configs/models/comenet.yaml b/configs/models/comenet.yaml new file mode 100644 index 0000000000..392bdfdc76 --- /dev/null +++ b/configs/models/comenet.yaml @@ -0,0 +1,62 @@ +default: + model: + name: comenet + use_pbc: True + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + 10k: {} + 100k: {} + all: {} + +# ------------------ +# ----- S2EF ----- +# ------------------ + +s2ef: + default: {} + 200k: {} + 2M: {} + 20M: {} + all: {} + +qm9: + default: + model: + cutoff: 5.0 + num_layers: 5 + hidden_channels: 256 + out_channels: 1 + num_radial: 3 + num_spherical: 2 + num_output_layers: 3 + optim: + batch_size: 32 + num_workers: 4 + lr_initial: 0.001 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 + + 10k: {} + all: {} + 
+qm7x: + default: {} + all: {} + 1k: {} + + diff --git a/ocpmodels/models/comenet.py b/ocpmodels/models/comenet.py new file mode 100644 index 0000000000..ec8ada6e54 --- /dev/null +++ b/ocpmodels/models/comenet.py @@ -0,0 +1,42 @@ +from dig.threedgraph.method import ComENet as DIGComENet +from ocpmodels.models.base_model import BaseModel +import torch +from ocpmodels.common.registry import registry +from ocpmodels.common.utils import conditional_grad +from copy import deepcopy + + +@registry.register_model("comenet") +class ComENet(BaseModel): + def __init__(self, **kwargs): + super().__init__() + self.regress_forces = False + self.cutoff = kwargs.get("cutoff", 5.0) + self.num_layers = kwargs.get("num_layers", 4) + self.hidden_channels = kwargs.get("hidden_channels", 128) + self.out_channels = kwargs.get("out_channels", 1) + self.num_spherical = kwargs.get("num_spherical", 3) + self.num_radial = kwargs.get("num_radial", 6) + self.num_output_layers = kwargs.get("num_output_layers", 3) + self.comenet = DIGComENet( + cutoff=self.cutoff, + num_layers=self.num_layers, + hidden_channels=self.hidden_channels, + out_channels=self.out_channels, + num_spherical=self.num_spherical, + num_radial=self.num_radial, + num_output_layers=self.num_output_layers, + ) + + @conditional_grad(torch.enable_grad()) + def forces_forward(self, preds): + return + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + # Rewire the graph + z = data.atomic_numbers.long() + batch_data = deepcopy(data) + batch_data.z = z + + return {"energy": self.comenet.forward(batch_data)} From 16c0f9eb10b58ace9c88868c7082717e0ed94e4f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:06:45 -0500 Subject: [PATCH 266/273] Add SPHERENET model --- configs/models/spherenet.yaml | 20 +++++++++++--------- ocpmodels/models/spherenet.py | 4 +++- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/configs/models/spherenet.yaml b/configs/models/spherenet.yaml index 1cea3c24bc..32c0d7b6df 100644 --- a/configs/models/spherenet.yaml +++ b/configs/models/spherenet.yaml @@ -43,20 +43,22 @@ qm9: out_channels: 1 out_emb_channels: 256 optim: - batch_size: 1024 + batch_size: 32 + num_workers: 4 lr_initial: 0.001 max_epochs: 1000 decay_steps: 125000 decay_rate: 0.01 ema_decay: 0.999 - lr_gamma: 0.25 - lr_milestones: - - 17981 - - 26972 - - 35963 - - 52000 - - 100000 - warmup_steps: 1000 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 10k: {} all: {} diff --git a/ocpmodels/models/spherenet.py b/ocpmodels/models/spherenet.py index 7a5be57181..df0024fe8e 100644 --- a/ocpmodels/models/spherenet.py +++ b/ocpmodels/models/spherenet.py @@ -6,10 +6,12 @@ from copy import deepcopy +@registry.register_model("spherenet") class SphereNet(BaseModel): def __init__(self, **kwargs): super().__init__() self.energy_and_force = kwargs.get("energy_and_force", False) + self.regress_forces = "from_energy" if self.energy_and_force else False self.cutoff = kwargs.get("cutoff", 5.0) self.num_layers = kwargs.get("num_layers", 4) self.hidden_channels = kwargs.get("hidden_channels", 128) @@ -55,4 +57,4 @@ def energy_forward(self, data): batch_data = deepcopy(data) batch_data.z = z - return self.spherenet.forward(batch_data) + return {"energy": self.spherenet.forward(batch_data)} From f8882ec4912bd72acf25b7bd1559f9c6ce238159 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 
12:07:02 -0500 Subject: [PATCH 267/273] update dimenet (not ++) --- configs/models/dimenet.yaml | 69 ++++++++++++++++++++++++++++++++++++ ocpmodels/models/__init__.py | 2 +- ocpmodels/models/dimenet.py | 37 +++++++++++++++++-- 3 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 configs/models/dimenet.yaml diff --git a/configs/models/dimenet.yaml b/configs/models/dimenet.yaml new file mode 100644 index 0000000000..992296d8a8 --- /dev/null +++ b/configs/models/dimenet.yaml @@ -0,0 +1,69 @@ +default: + model: + name: dimenet + use_pbc: False + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + 10k: {} + 100k: {} + all: {} + +# ------------------ +# ----- S2EF ----- +# ------------------ + +s2ef: + default: {} + 200k: {} + 2M: {} + 20M: {} + all: {} + +qm9: + default: + model: + hidden_channels: 128 + out_channels: 1 + num_blocks: 6 + num_bilinear: 8 + num_spherical: 6 + num_radial: 6 + cutoff: 5.0 + max_num_neighbors: 40 + envelope_exponent: 5 + num_before_skip: 1 + num_after_skip: 2 + num_output_layers: 3 + act: swish + regress_forces: False + optim: + batch_size: 32 + num_workers: 4 + lr_initial: 0.001 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 + + 10k: {} + all: {} + +qm7x: + default: {} + all: {} + 1k: {} + + diff --git a/ocpmodels/models/__init__.py b/ocpmodels/models/__init__.py index ad3c0ccce9..9ad38b86bc 100644 --- a/ocpmodels/models/__init__.py +++ b/ocpmodels/models/__init__.py @@ -5,7 +5,7 @@ from .base_model import BaseModel # noqa: F401 from .cgcnn import CGCNN # noqa: F401 -from .dimenet import DimeNetWrap as DimeNet # noqa: F401 +from .dimenet import DimeNet # noqa: F401 from .old_dimenet_plus_plus import ( # noqa: F401 DimeNetPlusPlusWrap as OldDimeNetPlusPlus, ) diff --git a/ocpmodels/models/dimenet.py b/ocpmodels/models/dimenet.py index 341dee6da0..f74c58c644 100644 --- a/ocpmodels/models/dimenet.py +++ b/ocpmodels/models/dimenet.py @@ -6,11 +6,12 @@ """ import torch -from torch_geometric.nn import DimeNet, radius_graph +from torch_geometric.nn import DimeNet as PYGDimeNet, radius_graph from torch_scatter import scatter from torch_sparse import SparseTensor from ocpmodels.common.registry import registry +from ocpmodels.models.base_model import BaseModel from ocpmodels.common.utils import ( conditional_grad, get_pbc_distances, @@ -18,8 +19,40 @@ ) +@registry.register_model("dimenet") +class DimeNet(BaseModel): + def __init__(self, **kwargs) -> None: + super().__init__() + self.regress_forces = bool(kwargs.get("regress_forces")) + self.dimenet = PYGDimeNet( + hidden_channels=kwargs.get("hidden_channels"), + out_channels=kwargs.get("out_channels"), + num_blocks=kwargs.get("num_blocks"), + num_bilinear=kwargs.get("num_bilinear"), + num_spherical=kwargs.get("num_spherical"), + num_radial=kwargs.get("num_radial"), + cutoff=kwargs.get("cutoff"), + max_num_neighbors=kwargs.get("max_num_neighbors"), + envelope_exponent=kwargs.get("envelope_exponent"), + num_before_skip=kwargs.get("num_before_skip"), + num_after_skip=kwargs.get("num_after_skip"), + num_output_layers=kwargs.get("num_output_layers"), + act=kwargs.get("act"), + ) + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + return { + "energy": self.dimenet.forward(data.atomic_numbers, data.pos, data.batch) + } + + 
@conditional_grad(torch.enable_grad()) + def forces_forward(self, preds): + return + + @registry.register_model("old_dimenet") -class DimeNetWrap(DimeNet): +class OldDimeNetWrap(PYGDimeNet): r"""Wrapper around the directional message passing neural network (DimeNet) from the `"Directional Message Passing for Molecular Graphs" `_ paper. From fc5e6f2a3331eaf7f1d95ea43887fe6a9d1b3d1b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:07:49 -0500 Subject: [PATCH 268/273] define `model_config`outside of model init to prevent kwarg duplication --- ocpmodels/trainers/base_trainer.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 13252ad414..3947f144a2 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -356,12 +356,18 @@ def load_model(self): if hasattr(sample, "x") and hasattr(sample.x, "shape"): num_atoms = sample.x.shape[-1] - self.model = registry.get_model_class(self.config["model_name"])( - num_atoms=num_atoms, - bond_feat_dim=bond_feat_dim, - num_targets=self.num_targets, - task_name=self.task_name, + model_config = { + **{ + "num_atoms": num_atoms, + "bond_feat_dim": bond_feat_dim, + "num_targets": self.num_targets, + "task_name": self.task_name, + }, **self.config["model"], + } + + self.model = registry.get_model_class(self.config["model_name"])( + **model_config ).to(self.device) if dist_utils.is_master() and not self.silent: From deae37669e7166a8a0566f4268661e0cfd42f8b0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:08:18 -0500 Subject: [PATCH 269/273] enable `eval_all_splits` from a ckpt file path --- ocpmodels/trainers/base_trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 3947f144a2..211662a443 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -808,7 +808,7 @@ def save_results(self, predictions, results_file, keys): np.savez_compressed(full_path, **gather_results) def eval_all_splits( - self, final=True, disable_tqdm=True, debug_batches=-1, epoch=-1 + self, final=True, disable_tqdm=True, debug_batches=-1, epoch=-1, from_ckpt=None ): """Evaluate model on all four validation splits""" @@ -824,7 +824,9 @@ def eval_all_splits( logging.info(f"Evaluating on {len(all_splits)} val splits.") # Load current best checkpoint for final evaluation - if final and epoch != 0: + if from_ckpt: + self.load_checkpoint(checkpoint_path=from_ckpt) + elif final and epoch != 0: checkpoint_path = os.path.join( self.config["checkpoint_dir"], "best_checkpoint.pt" ) From d5d15b157c4eedfdc4e890627afa90fe16b276a6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:08:40 -0500 Subject: [PATCH 270/273] `end_of_training` from ckpt --- ocpmodels/trainers/single_trainer.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index d5a3d2a197..6ac75b81cf 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -391,9 +391,23 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): epoch_int, debug_batches, model_run_time, epoch_times ) - def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times): + def end_of_training( + self, + epoch_int, + debug_batches, + model_run_time, + 
epoch_times, + from_ckpt=None, + disable_tqdm=True, + ): - eas = self.eval_all_splits(True, epoch=epoch_int, debug_batches=debug_batches) + eas = self.eval_all_splits( + True, + epoch=epoch_int, + debug_batches=debug_batches, + from_ckpt=from_ckpt, + disable_tqdm=disable_tqdm, + ) if eas == "SIGTERM": return "SIGTERM" From e930138c28b4772579d91f746e89fabeffd030f6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:08:56 -0500 Subject: [PATCH 271/273] print symmetry results if not silent --- ocpmodels/trainers/single_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 6ac75b81cf..c9be544f15 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -438,6 +438,8 @@ def end_of_training( return "SIGTERM" if self.logger: self.logger.log(symmetry) + if not self.silent: + print(symmetry) # TODO: Test equivariance From 7765e6eb659989d399edb234a0222912aba7212c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:09:10 -0500 Subject: [PATCH 272/273] rename former eval script --- ...odels_on_all_splits.py => legacy_eval_models_on_all_splits.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{eval_models_on_all_splits.py => legacy_eval_models_on_all_splits.py} (100%) diff --git a/scripts/eval_models_on_all_splits.py b/scripts/legacy_eval_models_on_all_splits.py similarity index 100% rename from scripts/eval_models_on_all_splits.py rename to scripts/legacy_eval_models_on_all_splits.py From d5afd6b5e5ef5a2f919e8685f72fb88507cf138d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:09:33 -0500 Subject: [PATCH 273/273] new eval script with `continue_from_dir` and `end_of_training` --- scripts/eval_model.py | 56 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/eval_model.py diff --git a/scripts/eval_model.py b/scripts/eval_model.py new file mode 100644 index 0000000000..bcf2ef7502 --- /dev/null +++ b/scripts/eval_model.py @@ -0,0 +1,56 @@ +import sys +from copy import deepcopy +from pathlib import Path + +from minydra import resolved_args + +sys.path.append(str(Path(__file__).resolve().parent.parent)) + +from ocpmodels.common.flags import flags +from ocpmodels.common.utils import build_config, resolve, setup_imports, merge_dicts +from ocpmodels.trainers.single_trainer import SingleTrainer + +if __name__ == "__main__": + + args = resolved_args( + defaults={ + "job_id": None, + "dir": None, + "config": {}, + }, + strict=False, + ) + assert ( + args.job_id is not None or args.dir is not None + ), "Must specify either job_id or dir." + + path = ( + resolve(args.dir) + if args.dir is not None + else resolve("$SCRATCH/ocp/runs") / str(args.job_id) + ) + + setup_imports() + argv = deepcopy(sys.argv) + sys.argv[1:] = [] + trainer_args = flags.parser.parse_args() + sys.argv[1:] = argv + trainer_args.continue_from_dir = str(path) + config = build_config(trainer_args, []) + config["logger"] = "dummy" + config["checkpoint"] = str(path / "checkpoints" / "best_checkpoint.pt") + config = merge_dicts(config, args.config) + + trainer = SingleTrainer(**config) + + trainer.silent = False + trainer.eval_on_test = True + + trainer.end_of_training( + -1, + -1, + -1, + [-1], + from_ckpt=config["checkpoint"], + disable_tqdm=False, + )
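
For reference, `scripts/eval_model.py` is driven by minydra-style `key=value` arguments rather than the argparse flags used by the trainers: it resolves the run directory from either `job_id` (defaulting to `$SCRATCH/ocp/runs/<job_id>`) or an explicit `dir`, loads `checkpoints/best_checkpoint.pt`, and calls `end_of_training` with `from_ckpt` so every validation split is re-evaluated. A hypothetical invocation could look like the following (the job id, path and override are placeholders, and the nested `config.*` override assumes minydra's dotted-key syntax):

    python scripts/eval_model.py job_id=1234567
    python scripts/eval_model.py dir=/path/to/run config.optim.batch_size=16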