From b6f1dd73487083524bc24acd369f13b61aabeced Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:17:52 -0500 Subject: [PATCH 001/273] print when saving checkpoint --- ocpmodels/common/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 2951078571..1938cf98b8 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -343,6 +343,7 @@ def save_checkpoint( state, checkpoint_dir="checkpoints/", checkpoint_file="checkpoint.pt" ): filename = os.path.join(checkpoint_dir, checkpoint_file) + print(f"Saving checkpoint to {filename}") torch.save(state, filename) From 5421e5972fda81423e43b317db96ef401131b5aa Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:18:35 -0500 Subject: [PATCH 002/273] log every n steps --- ocpmodels/common/flags.py | 6 ++++++ ocpmodels/trainers/single_trainer.py | 10 ++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 018b9dbe48..92dbb14421 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -241,6 +241,12 @@ def add_core_args(self): default=False, help="Don't copy LMDB data to $SLURM_TMPDIR and work from there", ) + self.parser.add_argument( + "--log_train_every", + type=int, + default=100, + help="Log training loss every n steps", + ) flags = Flags() diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index b6d1ae3c29..dc87854fee 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -247,10 +247,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): metrics={}, ) scale = self.scaler.get_scale() if self.scaler else 1.0 - for k, v in loss.items(): - self.metrics = self.evaluator.update( - k, v.item() / scale, self.metrics - ) + + if i_for_epoch % log_train_every == 0: + for k, v in loss.items(): + self.metrics = self.evaluator.update( + k, v.item() / scale, self.metrics + ) # Log metrics. 
self.log_train_metrics() From 46c8cdd7cd6ff2c53ecfc2dd53aa91dfc5ce21bd Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:18:45 -0500 Subject: [PATCH 003/273] add timing class --- ocpmodels/common/timer.py | 87 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 ocpmodels/common/timer.py diff --git a/ocpmodels/common/timer.py b/ocpmodels/common/timer.py new file mode 100644 index 0000000000..cb38f1c731 --- /dev/null +++ b/ocpmodels/common/timer.py @@ -0,0 +1,87 @@ +import torch +from time import time, sleep +from collections import defaultdict +import numpy as np + + +class Timer: + def __init__(self, name, store={}, gpu=False, ignore=False): + self.times = store + self.name = name + self.gpu = gpu + self.ignore = ignore + + def __enter__(self): + if self.ignore: + return self + if self.gpu: + self.start = torch.cuda.Event(enable_timing=True) + self.end = torch.cuda.Event(enable_timing=True) + self.start.record() + else: + self.start = time() + return self + + def __exit__(self, *args): + if self.ignore: + return + if self.gpu: + self.end.record() + torch.cuda.synchronize() + self.duration = self.start.elapsed_time(self.end) / 1000 + else: + self.end = time() + self.duration = self.end - self.start + self.times[self.name].append(self.duration) + + +class Times: + def __init__(self, gpu=False, ignore=False): + self.times = defaultdict(list) + self.timers = {} + self.gpu = gpu + self.ignore = ignore + + def reset(self): + self.times = defaultdict(list) + self.timers = {} + + def prepare_for_logging(self): + """ + Computes mean and standard deviation of all timers. + Returns a tuple: (mean_times_dict, std_times_dict) + + Returns: + tuple[dict]: a dict with mean times and a dict with std times + """ + mean_times = {} + std_times = {} + for k, v in self.times.items(): + mean_times[k] = np.mean(v) + std_times[k] = np.std(v) + return mean_times, std_times + + def next(self, name, ignore=None): + if "name" not in self.timers: + if ignore is None: + ignore = self.ignore + self.timers[name] = Timer(name, self.times, self.gpu, ignore) + return self.timers[name] + + +if __name__ == "__main__": + + times = Times(gpu=True) + with times.next("a"): + sleep(0.1) + with times.next("b"): + sleep(0.2) + with times.next("a"): + sleep(0.3) + with times.next("b"): + sleep(0.4) + with times.next("a"): + sleep(0.5) + with times.next("b"): + sleep(0.6) + print(times.prepare_for_logging()) From 1513bacc4296a6282bb02575fd42bfef061b193a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:18:52 -0500 Subject: [PATCH 004/273] time validation --- ocpmodels/trainers/base_trainer.py | 48 +++++++++++++++++------------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e983155c42..28b24a80df 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -44,6 +44,7 @@ from ocpmodels.modules.loss import DDPLoss, L2MAELoss from ocpmodels.modules.normalizer import Normalizer from ocpmodels.modules.scheduler import LRScheduler +from ocpmodels.common.timer import Times @registry.register_trainer("base") @@ -534,6 +535,7 @@ def validate( disable_tqdm=True, debug_batches=-1, is_final=False, + is_first=False, ): if distutils.is_master() and not self.silent: print() @@ -554,30 +556,34 @@ def validate( desc = "device {}".format(distutils.get_rank()) loader = self.loaders[split] - val_time = time.time() + times = Times(gpu=True) - for i, 
batch in enumerate(tqdm(loader, desc=desc, disable=disable_tqdm)): + with times.next("validation_loop"): - if self.sigterm: - return "SIGTERM" + for i, batch in enumerate(tqdm(loader, desc=desc, disable=disable_tqdm)): + + if self.sigterm: + return "SIGTERM" + + if debug_batches > 0 and i == debug_batches: + break - if debug_batches > 0 and i == debug_batches: - break + # Forward. + with torch.cuda.amp.autocast(enabled=self.scaler is not None): + with times.next("model_forward", ignore=not is_first): + preds = self.model_forward(batch) + loss = self.compute_loss(preds, batch) - # Forward. - with torch.cuda.amp.autocast(enabled=self.scaler is not None): - preds = self.model_forward(batch) + if preds.get("pooling_loss") is not None: + loss["total_loss"] += preds["pooling_loss"] - loss = self.compute_loss(preds, batch) - if preds.get("pooling_loss") is not None: - loss["total_loss"] += preds["pooling_loss"] + # Compute metrics. + metrics = self.compute_metrics(preds, batch, evaluator, metrics) + for k, v in loss.items(): + metrics = evaluator.update(k, v.item(), metrics) - # Compute metrics. - metrics = self.compute_metrics(preds, batch, evaluator, metrics) - for k, v in loss.items(): - metrics = evaluator.update(k, v.item(), metrics) + mean_val_times, std_val_times = times.prepare_for_logging() - val_time = time.time() - val_time aggregated_metrics = {} for k in metrics: aggregated_metrics[k] = { @@ -594,9 +600,11 @@ def validate( metrics = aggregated_metrics log_dict = {k: metrics[k]["metric"] for k in metrics} - log_dict.update({"epoch": self.epoch}) - log_dict.update({f"{split}_time": val_time}) - log_dict.update({f"{split}_n_samples": i + 1}) + log_dict["epoch"] = self.epoch + log_dict[f"{split}_time"] = mean_val_times["validation_loop"] + if is_first: + log_dict["model_forward_time_mean"] = mean_val_times["model_forward"] + log_dict["model_forward_time_std"] = std_val_times["model_forward"] if distutils.is_master() and not self.silent: log_str = ["{}: {:.4f}".format(k, v) for k, v in log_dict.items()] From 0dcea16185f70eb2fb4e5b9ff32810412f313709 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:18:57 -0500 Subject: [PATCH 005/273] time batch retrieval --- ocpmodels/trainers/single_trainer.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index dc87854fee..6f8c3a2ceb 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -25,6 +25,7 @@ from ocpmodels.modules.evaluator import Evaluator from ocpmodels.modules.normalizer import Normalizer from ocpmodels.trainers.base_trainer import BaseTrainer +from ocpmodels.common.timer import Times is_test_env = os.environ.get("ocp_test_env", False) @@ -190,10 +191,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): ) self.best_val_metric = np.inf current_val_metric = None + first_eval = True # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. 
start_epoch = self.step // n_train + loader_times = Times() epoch_times = [] if not self.silent: @@ -211,6 +214,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): train_loader_iter = iter(self.loaders["train"]) self.model.train() i_for_epoch = 0 + log_train_every = self.config["log_train_every"] for i in range(skip_steps, n_train): if self.sigterm: @@ -220,7 +224,8 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.step = epoch_int * n_train + i + 1 # Get a batch. - batch = next(train_loader_iter) + with loader_times.time("get_batch"): + batch = next(train_loader_iter) # Forward, loss, backward. with torch.cuda.amp.autocast(enabled=self.scaler is not None): @@ -229,10 +234,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): if preds.get("pooling_loss") is not None: coeff = self.config["optim"].get("pooling_coefficient", 1) loss["total_loss"] += preds["pooling_loss"] * coeff + loss = { k: self.scaler.scale(v) if self.scaler else v for k, v in loss.items() } + if torch.isnan(loss["total_loss"]): print("\n\n >>> 🛑 Loss is NaN. Stopping training.\n\n") self.logger.add_tags(["nan_loss"]) @@ -254,8 +261,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): k, v.item() / scale, self.metrics ) - # Log metrics. - self.log_train_metrics() + # Log metrics. + gbm, gbs = loader_times.prepare_for_logging() + self.metrics["get_batch_time_mean"] = {"metric": gbm["get_batch"]} + self.metrics["get_batch_time_std"] = {"metric": gbs["get_batch"]} + loader_times.reset() + self.log_train_metrics() is_final_epoch = epoch_int == self.config["optim"]["max_epochs"] - 1 is_final_batch = (i == n_train - 1) or ( @@ -283,7 +294,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): split=self.config["dataset"]["default_val"], disable_tqdm=disable_eval_tqdm, debug_batches=debug_batches, + is_first=first_eval, ) + first_eval = False if val_metrics == "SIGTERM": return "SIGTERM" current_val_metric = val_metrics[primary_metric]["metric"] From 3c41f67bbd5f74af9024a70866740874ac1c5aeb Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:21:14 -0500 Subject: [PATCH 006/273] typo --- ocpmodels/trainers/single_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 6f8c3a2ceb..6d6c24a916 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -224,7 +224,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.step = epoch_int * n_train + i + 1 # Get a batch. - with loader_times.time("get_batch"): + with loader_times.next("get_batch"): batch = next(train_loader_iter) # Forward, loss, backward. 
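For reference, a minimal sketch of how the `Times` helper introduced in patch 003 is meant to be used once the `.time` → `.next` typo above is fixed. The import path comes from this series; the sleep call merely stands in for real work such as `batch = next(train_loader_iter)`, and torch/numpy are assumed to be installed since `ocpmodels.common.timer` imports them.

```python
from time import sleep

from ocpmodels.common.timer import Times  # module added in patch 003 of this series

loader_times = Times()  # gpu=True would time with CUDA events instead of time()

for _ in range(3):
    # next(name) returns a Timer context manager; on exit it appends the
    # measured duration to the list stored under `name`.
    with loader_times.next("get_batch"):
        sleep(0.01)  # stand-in for `batch = next(train_loader_iter)`

# prepare_for_logging() returns ({name: mean}, {name: std}); the trainer logs
# these as get_batch_time_mean / get_batch_time_std and then calls reset().
mean_times, std_times = loader_times.prepare_for_logging()
print(mean_times, std_times)
loader_times.reset()
```
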
From 64180f2f85241a856b9ba6a2a5d45b66f9002084 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:32:08 -0500 Subject: [PATCH 007/273] remove `energy_within_threshold` from qm9 and qm7x --- ocpmodels/modules/evaluator.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ocpmodels/modules/evaluator.py b/ocpmodels/modules/evaluator.py index 38f1833295..da01447b3d 100644 --- a/ocpmodels/modules/evaluator.py +++ b/ocpmodels/modules/evaluator.py @@ -56,12 +56,10 @@ class Evaluator: "qm9": [ "energy_mae", "energy_mse", - "energy_within_threshold", ], "qm7x": [ "energy_mae", "energy_mse", - "energy_within_threshold", ], } From c1b844622aeb20037d3d6207ff8a436049750c5d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:32:28 -0500 Subject: [PATCH 008/273] initialize `signal` to `None` --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index e561a088e6..1d6a2b842d 100644 --- a/main.py +++ b/main.py @@ -134,7 +134,7 @@ def print_warnings(): if __name__ == "__main__": - ntfy = trainer = error = None + ntfy = trainer = error = signal = None setup_logging() From f5ca589ec067a3aa6520c7b85aa2a2ee797d7211 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:32:34 -0500 Subject: [PATCH 009/273] log epoch time --- ocpmodels/trainers/single_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 6d6c24a916..23acb1f513 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -317,9 +317,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # End of batch. # End of epoch. + epoch_times.append(time.time() - start_time) + self.metrics["epoch_time"] = {"metric": epoch_times[-1]} self.log_train_metrics(end_of_epoch=True) torch.cuda.empty_cache() - epoch_times.append(time.time() - start_time) # End of training. 
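A schematic illustration of the epoch-timing bookkeeping in patches 005 and 009: the duration is appended before the end-of-epoch logging call so that `epoch_times[-1]` exists when the metric dict is built, and the mean is logged once at the end of training. Names mirror `single_trainer.py`; the sleep is a stand-in for the inner batch loop.

```python
import time

epoch_times = []  # collected across the whole run
metrics = {}      # stand-in for self.metrics

for epoch_int in range(2):
    start_time = time.time()
    time.sleep(0.01)  # stand-in for the loop over training batches
    epoch_times.append(time.time() - start_time)
    metrics["epoch_time"] = {"metric": epoch_times[-1]}
    # self.log_train_metrics(end_of_epoch=True) would flush `metrics` here

# End of training: the average epoch duration is logged as "Epoch time".
print({"Epoch time": sum(epoch_times) / len(epoch_times)})
```
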
From 55471ab24cccc2ad2adcf4b47b4fc8ada6509947 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 18:59:00 -0500 Subject: [PATCH 010/273] fix log epoch_time --- ocpmodels/common/logger.py | 4 +++- ocpmodels/trainers/single_trainer.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index f526f686b7..1e329d59f0 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -91,7 +91,9 @@ class WandBLogger(Logger): def __init__(self, trainer_config): super().__init__(trainer_config) - wandb_id = "" + wandb_id = str(self.trainer_config.get("wandb_id", "")) + if wandb_id: + wandb_id += " - " slurm_jobid = os.environ.get("SLURM_JOB_ID") if slurm_jobid: wandb_id += f"{slurm_jobid}-" diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 23acb1f513..7c0137a6e3 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -583,14 +583,14 @@ def log_train_metrics(self, end_of_epoch=False): and distutils.is_master() and not self.is_hpo ) or (distutils.is_master() and end_of_epoch): - log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] if not self.silent: + log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] print( f"Train metrics at step {self.step}:\n > " + "\n > ".join(log_str) ) self.metrics = {} - if self.logger is not None and not end_of_epoch: + if self.logger is not None: # and not end_of_epoch: self.logger.log( log_dict, step=self.step, From 27f540e23e0d6a9e3b759a48c5fba3bf9b6d3e70 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 19:23:42 -0500 Subject: [PATCH 011/273] print log_train_every --- ocpmodels/trainers/single_trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 7c0137a6e3..38afa7d9ce 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -192,6 +192,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.best_val_metric = np.inf current_val_metric = None first_eval = True + log_train_every = self.config["log_train_every"] + + print("Logging train metrics every {} steps".format(log_train_every)) # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. 
@@ -214,7 +217,6 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): train_loader_iter = iter(self.loaders["train"]) self.model.train() i_for_epoch = 0 - log_train_every = self.config["log_train_every"] for i in range(skip_steps, n_train): if self.sigterm: From ca4a173c63643b79f324f3545d52c2094d7f934b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 20:01:31 -0500 Subject: [PATCH 012/273] fix print-every --- ocpmodels/common/flags.py | 11 ++--------- ocpmodels/trainers/single_trainer.py | 3 ++- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 92dbb14421..f487115b31 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -64,9 +64,9 @@ def add_core_args(self): ) self.parser.add_argument( "--print-every", - default=1000, + default=-1, type=int, - help="Log every N iterations (default: 10)", + help="Log every N iterations (default: -1 = end of epoch)", ) self.parser.add_argument( "--seed", default=0, type=int, help="Seed for torch, cuda, numpy" @@ -172,13 +172,6 @@ def add_core_args(self): default="", help="Comma-separated tags for wandb", ) - self.parser.add_argument( - "--print_every", - type=int, - default=-1, - help="Printing frequency (in steps). " - + "Default (-1) prints at the end of the epoch.", - ) self.parser.add_argument( "--wandb_project", type=str, diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 38afa7d9ce..9d3ac2d811 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -194,7 +194,8 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): first_eval = True log_train_every = self.config["log_train_every"] - print("Logging train metrics every {} steps".format(log_train_every)) + print(f"Logging train metrics every {log_train_every} steps") + print(f"Printing train metrics every {self.config['print_every']} steps") # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. 
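Patches 011-012 separate two cadences: `--log_train_every` controls how often per-step losses are folded into `self.metrics`, while `--print-every` (now defaulting to -1, i.e. end of epoch only) controls how often they are printed. A schematic sketch of that split, with illustrative gating rather than the exact condition used in `log_train_metrics`:

```python
log_train_every = 100  # --log_train_every: accumulate train metrics every N steps
print_every = -1       # --print-every: print every N steps; -1 = end of epoch only

n_train = 1000
for i_for_epoch in range(n_train):
    if i_for_epoch % log_train_every == 0:
        pass  # fold the scaled per-loss values into self.metrics here

    end_of_epoch = i_for_epoch == n_train - 1
    if (print_every > 0 and (i_for_epoch + 1) % print_every == 0) or end_of_epoch:
        pass  # print the accumulated train metrics, then clear self.metrics
```
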
From 36d1eda51f24b64ff83c6cdf8b153f29bed35e91 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 4 Jan 2023 21:09:42 -0500 Subject: [PATCH 013/273] comment out step print --- ocpmodels/trainers/single_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 9d3ac2d811..388afa127c 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -269,6 +269,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.metrics["get_batch_time_mean"] = {"metric": gbm["get_batch"]} self.metrics["get_batch_time_std"] = {"metric": gbs["get_batch"]} loader_times.reset() + # logging.info(f"Step: {self.step}") self.log_train_metrics() is_final_epoch = epoch_int == self.config["optim"]["max_epochs"] - 1 From fbeb9f46764398ae11809617bbdc6f4c67a222b3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 00:02:26 -0500 Subject: [PATCH 014/273] write summary yaml --- launch_exp.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/launch_exp.py b/launch_exp.py index f1e043c7e1..16f5932b2b 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -58,6 +58,28 @@ def merge_dicts(dict1: dict, dict2: dict): return return_dict +def write_exp_yaml_and_jobs(exp_file, outfile, jobs): + """ + Reads the exp_file, adds the jobs as comments in each run line and writes the + resulting yaml file in the same directory as the outfile. + + Args: + exp_file (Path): Path to the experimental yaml file + outfile (Path): Path to the output txt file + jobs (list[str]): List of jobs, one per run line in the yaml exp_file + """ + lines = exp_file.read_text().splitlines() + run_line = lines.index("runs:") + j = 0 + for i, line in enumerate(lines[run_line:]): + if line.strip().startswith("- "): + lines[run_line + i] = f"{line} # {jobs[j]}" + j += 1 + yml_out = outfile.with_suffix(".yaml") + yml_out.write_text("\n".join(lines)) + return yml_out + + def get_commit(): try: commit = ( @@ -173,5 +195,9 @@ def cli_arg(args, key=""): f.write(text) print(f"Output written to {str(outfile)}") print("All job launched:", " ".join(jobs)) + yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) + print( + "Experiment summary YAML in ", f"./{str(yml_out.relative_to(Path.cwd()))}" + ) else: print("Aborting") From 22ef8240584e50ff261b8745c35081010dc6643a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 00:02:53 -0500 Subject: [PATCH 015/273] update exp based on results from `2624343` and `2623710` --- configs/exps/qm7x/schnet.yaml | 83 ++++++++++++----------------------- 1 file changed, 28 insertions(+), 55 deletions(-) diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index da55e6304d..1715b989cf 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -8,6 +8,7 @@ job: env: ocp-a100 default: + config: schnet-qm7x-all wandb_project: ocp-qm mode: train test_ri: true @@ -31,7 +32,7 @@ default: val_ood: std_divider: 10.0 optim: - batch_size: 32 + batch_size: 1024 warmup_steps: 3000 lr_initial: 0.0005 # parameters EMA @@ -39,62 +40,34 @@ default: decay_steps: 750000 decay_rate: 0.05 max_steps: 1000000 + model: + hidden_channels: 256 + num_filters: 256 + num_gaussians: 100 + num_interactions: 6 + cutoff: 5.0 runs: - - config: schnet-qm7x-all - model: - hidden_channels: 128 - num_gaussians: 20 - num_filters: 128 - num_interactions: 6 - cutoff: 5.0 - - - - config: schnet-qm7x-all - model: - hidden_channels: 256 - 
num_gaussians: 20 - num_filters: 256 - num_interactions: 6 - cutoff: 5.0 - - - config: schnet-qm7x-all - model: - hidden_channels: 256 - num_gaussians: 20 - num_filters: 256 - num_interactions: 6 - cutoff: 5.0 - optim: + - {} + - optim: + batch_size: 2048 + - optim: + batch_size: 4096 + - optim: lr_initial: 0.001 - - - config: schnet-qm7x-all - model: - hidden_channels: 512 - num_gaussians: 20 - num_filters: 256 - num_interactions: 6 - cutoff: 5.0 - optim: - lr_initial: 0.005 - - - config: schnet-qm7x-all - model: - hidden_channels: 128 - num_gaussians: 20 - num_filters: 128 + - optim: + lr_initial: 0.001 + batch_size: 2048 + - model: + num_gaussians: 200 + - model: + hidden_channels: 1024 + - model: + num_filters: 1024 + - model: + num_interactions: 8 + - model: num_interactions: 6 - cutoff: 5.0 - optim: - lr_initial: 0.0002 - - - config: schnet-qm7x-all - model: - hidden_channels: 512 num_gaussians: 20 - num_filters: 512 - num_interactions: 6 - cutoff: 5.0 - optim: - batch_size: 128 - lr_initial: 0.0001 \ No newline at end of file + num_filters: 64 + hidden_channels: 1024 \ No newline at end of file From 45904b20470befafcd7dbd0d238e365f3eaf3210 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 00:18:35 -0500 Subject: [PATCH 016/273] v0 fanets QMs --- configs/exps/qm7x/fanet.yaml | 106 +++++++++++++++++++++++++++++++++++ configs/exps/qm9/fanet.yaml | 99 ++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) create mode 100644 configs/exps/qm7x/fanet.yaml create mode 100644 configs/exps/qm9/fanet.yaml diff --git a/configs/exps/qm7x/fanet.yaml b/configs/exps/qm7x/fanet.yaml new file mode 100644 index 0000000000..0dd40f39c1 --- /dev/null +++ b/configs/exps/qm7x/fanet.yaml @@ -0,0 +1,106 @@ +# trainset has 4068193 samples +job: + mem: 48GB + cpus: 8 + gres: gpu:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: fanet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x, std/10 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, mp_type, edge_embed_type + optim: batch_size, lr_initial + dataset: + train: + std_divider: 10.0 + val_id: + std_divider: 10.0 + val_ood: + std_divider: 10.0 + optim: + batch_size: 1024 + warmup_steps: 3000 + lr_initial: 0.0005 + # parameters EMA + ema_decay: 0.999 + decay_steps: 750000 + decay_rate: 0.05 + max_steps: 1000000 + model: + # PhAST + phys_embeds: False + phys_hidden_channels: 0 + energy_head: False # "weighted-av-init-embeds", "weighted-av-final-embeds" + pg_hidden_channels: 0 + tag_hidden_channels: 0 + # archi + hidden_channels: 256 + num_filters: 256 + num_gaussians: 32 + num_interactions: 4 + cutoff: 6.0 + regress_forces: False + # fanet + skip_co: False # output skip connections + second_layer_MLP: False # in EmbeddingBlock + edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) + mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} + force_decoder_type: "simple" # can be {"" or "simple"} | only used if regress_forces is True + force_decoder_model_config: + simple: + hidden_channels: 128 + +runs: + - {} + - model: + pg_hidden_channels: 64 + - model: + energy_head: "weighted-av-init-embeds" + - model: + phys_embeds: True + - model: + pg_hidden_channels: 64 + phys_embeds: True + energy_head: "weighted-av-init-embeds" + - model: + mp_type: base + edge_embed_type: all_rij + - model: + mp_type: base + 
edge_embed_type: sh + - model: + mp_type: base + edge_embed_type: all + - model: + mp_type: simple + edge_embed_type: rij + - model: + mp_type: simple + edge_embed_type: all_rij + - model: + mp_type: simple + edge_embed_type: sh + - model: + mp_type: simple + edge_embed_type: all + - model: + mp_type: updownscale + edge_embed_type: rij + - model: + mp_type: updownscale + edge_embed_type: all_rij + - model: + mp_type: updownscale + edge_embed_type: sh + - model: + mp_type: updownscale + edge_embed_type: all diff --git a/configs/exps/qm9/fanet.yaml b/configs/exps/qm9/fanet.yaml new file mode 100644 index 0000000000..b258c5c3df --- /dev/null +++ b/configs/exps/qm9/fanet.yaml @@ -0,0 +1,99 @@ +# trainset has 4068193 samples +job: + mem: 48GB + cpus: 8 + gres: gpu:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: fanet-qm9-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm9, std/10 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions + optim: batch_size, lr_initial + optim: + batch_size: 1024 + warmup_steps: 3000 + lr_initial: 0.0005 + # parameters EMA + ema_decay: 0.999 + decay_steps: 750000 + decay_rate: 0.05 + max_steps: 1000000 + model: + # PhAST + phys_embeds: False + phys_hidden_channels: 0 + energy_head: False # "weighted-av-init-embeds", "weighted-av-final-embeds" + pg_hidden_channels: 0 + tag_hidden_channels: 0 + # archi + hidden_channels: 256 + num_filters: 256 + num_gaussians: 32 + num_interactions: 4 + cutoff: 6.0 + regress_forces: False + # fanet + skip_co: False # output skip connections + second_layer_MLP: False # in EmbeddingBlock + edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) + mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} + force_decoder_type: "simple" # can be {"" or "simple"} | only used if regress_forces is True + force_decoder_model_config: + simple: + hidden_channels: 128 + +runs: + - {} + - model: + pg_hidden_channels: 64 + - model: + energy_head: "weighted-av-init-embeds" + - model: + phys_embeds: True + - model: + pg_hidden_channels: 64 + phys_embeds: True + energy_head: "weighted-av-init-embeds" + - model: + mp_type: base + edge_embed_type: all_rij + - model: + mp_type: base + edge_embed_type: sh + - model: + mp_type: base + edge_embed_type: all + - model: + mp_type: simple + edge_embed_type: rij + - model: + mp_type: simple + edge_embed_type: all_rij + - model: + mp_type: simple + edge_embed_type: sh + - model: + mp_type: simple + edge_embed_type: all + - model: + mp_type: updownscale + edge_embed_type: rij + - model: + mp_type: updownscale + edge_embed_type: all_rij + - model: + mp_type: updownscale + edge_embed_type: sh + - model: + mp_type: updownscale + edge_embed_type: all From b09db3881a77026d02c55be6d7e56c8afa98a8bb Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 11:22:49 -0500 Subject: [PATCH 017/273] add min 16GB memory in GPU gres --- configs/exps/qm7x/fanet.yaml | 2 +- configs/exps/qm7x/schnet.yaml | 2 +- configs/exps/qm9/baselines.yaml | 2 +- configs/exps/qm9/fanet.yaml | 2 +- configs/exps/qm9/sfarinet.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/exps/qm7x/fanet.yaml b/configs/exps/qm7x/fanet.yaml index 0dd40f39c1..627caca163 100644 --- a/configs/exps/qm7x/fanet.yaml +++ b/configs/exps/qm7x/fanet.yaml @@ -2,7 +2,7 @@ job: mem: 48GB cpus: 8 - gres: gpu:1 + gres: 
gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 env: ocp-a100 diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index 1715b989cf..65d4d6654f 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -2,7 +2,7 @@ job: mem: 48GB cpus: 8 - gres: gpu:1 + gres: gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 env: ocp-a100 diff --git a/configs/exps/qm9/baselines.yaml b/configs/exps/qm9/baselines.yaml index 2cafb1c69e..f43d553446 100644 --- a/configs/exps/qm9/baselines.yaml +++ b/configs/exps/qm9/baselines.yaml @@ -2,7 +2,7 @@ job: mem: 48GB cpus: 4 - gres: gpu:1 + gres: gpu:16gb:1 time: 24:00:00 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 diff --git a/configs/exps/qm9/fanet.yaml b/configs/exps/qm9/fanet.yaml index b258c5c3df..fedb248329 100644 --- a/configs/exps/qm9/fanet.yaml +++ b/configs/exps/qm9/fanet.yaml @@ -2,7 +2,7 @@ job: mem: 48GB cpus: 8 - gres: gpu:1 + gres: gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 env: ocp-a100 diff --git a/configs/exps/qm9/sfarinet.yaml b/configs/exps/qm9/sfarinet.yaml index 701a760143..2f3a3104cb 100644 --- a/configs/exps/qm9/sfarinet.yaml +++ b/configs/exps/qm9/sfarinet.yaml @@ -1,7 +1,7 @@ job: mem: 48GB cpus: 4 - gres: gpu:1 + gres: gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 env: ocp-a100 From 6924a32e7f99dd0e7c6612894e00109d289679b8 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 5 Jan 2023 12:22:46 -0500 Subject: [PATCH 018/273] more info when overriding max_epochs --- ocpmodels/trainers/base_trainer.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 28b24a80df..c188f0d32b 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -215,6 +215,7 @@ def load_datasets(self): transform = get_transforms(self.config) # TODO: train/val/test behavior batch_size = self.config["optim"]["batch_size"] + max_steps = self.config["optim"].get("max_steps", -1) for split, ds_conf in self.config["dataset"].items(): if split == "default_val": @@ -227,23 +228,21 @@ def load_datasets(self): shuffle = False if split == "train": shuffle = True - if self.config["optim"].get("max_steps"): + if max_steps > 0: if self.config["optim"].get("max_epochs", -1) > 0: print( "WARNING: Both max_steps and max_epochs are set.", "Using max_steps.", ) self.config["optim"]["max_epochs"] = int( - np.ceil( - self.config["optim"]["max_steps"] - / np.ceil(len(self.datasets[split]) / batch_size) - ) + np.ceil(max_steps / (len(self.datasets[split]) / batch_size)) ) print( "Setting max_epochs to", self.config["optim"]["max_epochs"], - f"from max_steps ({self.config['optim']['max_steps']})", - f"and batch_size ({self.config['optim']['batch_size']})\n", + f"from max_steps ({max_steps}),", + f"dataset length ({len(self.datasets[split])}),", + f"and batch_size ({batch_size})\n", ) self.samplers[split] = self.get_sampler( From 8314c023c4a70e44f36ea22097ec280e8d2418e0 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 6 Jan 2023 06:18:13 -0500 Subject: [PATCH 019/273] update configs --- configs/exps/gnn/test-gnn-1.yaml | 44 ++++++++++ configs/exps/gnn/test-gnn-all-1.yaml | 31 +++++++ configs/exps/prop-check/symmetries.yaml | 59 +++++++++++-- configs/exps/prop-check/symmetries_is2re.yaml | 37 ++++---- 
configs/models/fanet.yaml | 84 ++++++++++++++----- ocpmodels/models/fanet.py | 2 +- 6 files changed, 206 insertions(+), 51 deletions(-) create mode 100644 configs/exps/gnn/test-gnn-1.yaml create mode 100644 configs/exps/gnn/test-gnn-all-1.yaml diff --git a/configs/exps/gnn/test-gnn-1.yaml b/configs/exps/gnn/test-gnn-1.yaml new file mode 100644 index 0000000000..223f94d929 --- /dev/null +++ b/configs/exps/gnn/test-gnn-1.yaml @@ -0,0 +1,44 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? + optim: + lr_initial: 0.002 + lr_gamma: 0.07 + warmup_steps: 500 + wandb_tags: 'test-fanet' + +runs: + - config: fanet-is2re-10k + note: 'Base rij FANet GNNs' + model: + mp_type: base + edge_embed_type: rij + - config: fanet-is2re-10k + note: 'Updownscale all-embeds FANet GNNs' + model: + mp_type: updownscale + edge_embed_type: all + - config: fanet-is2re-10k + note: 'Simple SH FANet GNNs' + model: + mp_type: simple + edge_embed_type: sh + - config: fanet-is2re-10k + note: 'Simple skip-co 2-layers FANet GNNs' + model: + skip_co: True + second_layer_MLP: True + edge_embed_type: all_rij diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml new file mode 100644 index 0000000000..65fd1a50c8 --- /dev/null +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -0,0 +1,31 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:4 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? + wandb_tags: 'test-fanet' + +runs: + - config: fanet-is2re-all + note: 'Simple rij baseline' + model: + mp_type: simple + edge_embed_type: rij + optim: + lr_initial: 0.0007 + - config: fanet-is2re-all + note: 'Simple rij baseline' + model: + mp_type: simple + edge_embed_type: rij diff --git a/configs/exps/prop-check/symmetries.yaml b/configs/exps/prop-check/symmetries.yaml index cfdecbb322..0b26ce2d81 100644 --- a/configs/exps/prop-check/symmetries.yaml +++ b/configs/exps/prop-check/symmetries.yaml @@ -3,7 +3,7 @@ job: cpus: 4 gres: gpu:rtx8000:4 partition: long - time: 48:00:00 + time: 40:00:00 default: test_ri: True @@ -14,33 +14,76 @@ default: tag_hidden_channels: 64 pg_hidden_channels: 0 # shall have been 32 energy_head: False # False ? 
- regress_forces: from_energy optim: max_epochs: 5 wandb_tags: 'prop-check-ICLM' runs: + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs' + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: all + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 3D fa_frames: all + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' - frame_averaging: 3D - fa_frames: random + frame_averaging: DA + model: + regress_forces: from_energy + - config: sfarinet-s2ef-2M + note: 'Test Force Equivariance' + frame_averaging: 2D + fa_frames: det + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D - fa_frames: random + fa_frames: se3-det + model: + regress_forces: from_energy - config: sfarinet-s2ef-2M - note: 'Test Forces SE(3)-Equivariance' + note: 'Test Force Equivariance' frame_averaging: 2D - fa_frames: se3-all + fa_frames: all + model: + regress_forces: direct - config: sfarinet-s2ef-2M - note: 'Test Forces SE(3)-Equivariance' + note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: se3-random + model: + regress_forces: direct + - config: sfarinet-s2ef-2M + note: 'Test Force Equivariance' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + - config: sfarinet-s2ef-2M + note: 'Test Force Equivariance' + frame_averaging: DA + model: + regress_forces: direct_with_gradient_target + - config: sfarinet-s2ef-2M + note: 'No forces coefficient ! Only energy' + model: + regress_forces: direct + force_coefficient: 0 + energy_grad_coefficient: 10 + - config: sfarinet-s2ef-2M + note: 'Large energy grad coef' + frame_averaging: DA + model: + regress_forces: direct_with_gradient_target + energy_grad_coefficient: 50 diff --git a/configs/exps/prop-check/symmetries_is2re.yaml b/configs/exps/prop-check/symmetries_is2re.yaml index 1d07dab61f..d4553cf6f7 100644 --- a/configs/exps/prop-check/symmetries_is2re.yaml +++ b/configs/exps/prop-check/symmetries_is2re.yaml @@ -1,5 +1,5 @@ job: - mem: 32GB + mem: 48GB cpus: 4 gres: gpu:rtx8000:4 partition: long @@ -17,36 +17,35 @@ default: wandb_tags: 'prop-check-ICLM' runs: - - config: sfarinet-is2re-all - note: 'Baseline' - - config: sfarinet-is2re-all - note: 'Test 2D all symmetries IS2RE + FA correctness' - frame_averaging: 2D - fa_frames: all - config: sfarinet-is2re-all note: 'Test 3D all symmetries IS2RE + FA correctness' - frame_averaging: 3D + frame_averaging: DA fa_frames: all - config: sfarinet-is2re-all note: 'Test 3D se3-all symmetries IS2RE + FA correctness' frame_averaging: 3D fa_frames: se3-all - config: sfarinet-is2re-all - note: 'Test 2D random symmetries IS2RE + FA correctness' + note: '2D det symmetries IS2RE + FA correctness' frame_averaging: 2D - fa_frames: random + fa_frames: det - config: sfarinet-is2re-all - note: 'Test 2D se3-random symmetries IS2RE + FA correctness' - frame_averaging: 2D - fa_frames: se3-random + note: '3D det symmetries IS2RE + FA correctness' + frame_averaging: 3D + fa_frames: det - config: sfarinet-is2re-all - note: 'Test 2D random symmetries IS2RE + FA correctness more epochs' + note: '2D se3-random 30 epochs symmetries IS2RE + FA correctness' frame_averaging: 2D - fa_frames: random - optim: + fa_frames: se3-random + optim: max_epochs: 30 - config: sfarinet-is2re-all - note: 'Test invariance of DA more epochs' - optim: + note: 'Baseline 30 epochs 
symmetries IS2RE + FA correctness' + optim: max_epochs: 30 - frame_averaging: 'DA' + - config: sfarinet-is2re-all + note: '2D all 30 epochs symmetries IS2RE + FA correctness' + frame_averaging: 2D + fa_frames: all + optim: + max_epochs: 30 \ No newline at end of file diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index c03ba706cc..9840d6432f 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -2,8 +2,8 @@ default: model: name: fanet act: swish - hidden_channels: 256 - num_filters: 128 + hidden_channels: 128 + num_filters: 100 num_interactions: 3 num_gaussians: 100 cutoff: 6.0 @@ -20,11 +20,12 @@ default: second_layer_MLP: False # in EmbeddingBlock edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} - force_decoder_type: "simple" # can be {"" or "simple"} | only used if regress_forces is True + force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True force_decoder_model_config: simple: hidden_channels: 128 - + mlp: + hidden_channels: 256 optim: batch_size: 64 eval_batch_size: 64 @@ -33,6 +34,9 @@ default: lr_initial: 0.001 warmup_factor: 0.2 max_epochs: 20 + energy_grad_coefficient: 10 + force_coefficient: 30 + energy_coefficient: 1 frame_averaging: False # 2D, 3D, da, False fa_frames: False # can be {None, full, random, det, e3, e3-random, e3-det} @@ -53,6 +57,8 @@ is2re: max_epochs: 20 100k: + model: + hidden_channels: 256 optim: lr_initial: 0.005 lr_milestones: # epochs at which lr_initial <- lr_initial * lr_gamma @@ -63,14 +69,17 @@ is2re: max_epochs: 20 all: + model: + hidden_channels: 384 + num_interactions: 4 optim: lr_initial: 0.001 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - - 16000 - - 25000 - - 35000 + - 18000 + - 27000 + - 37000 warmup_steps: 5394 - max_epochs: 17 + max_epochs: 20 # ------------------ # ----- S2EF ----- @@ -79,15 +88,27 @@ is2re: s2ef: default: model: + num_interactions: 4 + hidden_channels: 750 + num_gaussians: 200 + num_filters: 256 regress_forces: "direct" force_coefficient: 30 energy_grad_coefficient: 10 optim: - warmup_steps: 5394 + batch_size: 48 + eval_batch_size: 48 + warmup_steps: 25000 + warmup_factor: 0.2 + lr_gamma: 0.1 + lr_initial: 0.0002 + max_epochs: 20 + warmup_steps: 20000 lr_milestones: - - 17981 - - 26972 - - 35963 + - 50000 + - 70000 + - 90000 + 200k: {} 2M: {} @@ -98,28 +119,45 @@ s2ef: qm9: default: + model: + hidden_channels: 150 + num_gaussians: 100 + num_filters: 128 + num_interactions: 6 + cutoff: 5.0 optim: + batch_size: 1024 lr_initial: 0.001 - lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - - 16000 - - 25000 - - 35000 - warmup_steps: 5394 - max_epochs: 17 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + lr_gamma: 0.25 + lr_milestones: + - 17981 + - 26972 + - 35963 + - 52000 + - 100000 + warmup_steps: 1000 10k: {} all: {} qm7x: default: + model: + hidden_channels: 384 + num_interactions: 4 + optim: lr_initial: 0.001 - lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - - 16000 - - 25000 - - 35000 + lr_milestones: + - 17981 + - 26972 + - 35963 warmup_steps: 5394 - max_epochs: 20 + max_epochs: 17 all: {} 1k: {} diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 17deea8eac..4a4cca9a5e 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -408,7 +408,7 @@ class FANet(BaseModel): of the edge embedding block. 
edge_embed_hidden (int): size of edge representation. could be num_filters or hidden_channels. - mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env}): + mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'}): specificies the MP of the interaction block. """ From 4c5f0556e1832bb50067a2803d62dfd364db96fe Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 6 Jan 2023 08:25:57 -0500 Subject: [PATCH 020/273] fix no gradient issue FANet 2 layers --- configs/exps/gnn/test-gnn-1.yaml | 3 ++- ocpmodels/models/fanet.py | 15 +++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/configs/exps/gnn/test-gnn-1.yaml b/configs/exps/gnn/test-gnn-1.yaml index 223f94d929..5c3b23f4ab 100644 --- a/configs/exps/gnn/test-gnn-1.yaml +++ b/configs/exps/gnn/test-gnn-1.yaml @@ -15,9 +15,10 @@ default: pg_hidden_channels: 0 # shall have been 32 energy_head: 'weighted-av-initial-embeds' # False ? optim: - lr_initial: 0.002 + lr_initial: 0.0035 lr_gamma: 0.07 warmup_steps: 500 + max_epochs: 25 wandb_tags: 'test-fanet' runs: diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 4a4cca9a5e..37a58f61f2 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -114,23 +114,22 @@ def __init__( # TODO: change some num_filters to edge_embed_hidden if self.edge_embed_type == "rij": self.lin_e1 = Linear(3, num_filters) - self.lin_e2 = Linear(num_filters, num_filters) elif self.edge_embed_type == "all_rij": self.lin_e1 = Linear(3, num_filters // 3) # r_ij self.lin_e12 = Linear(3, num_filters // 3) # norm r_ij self.lin_e13 = Linear( num_gaussians, num_filters - 2 * (num_filters // 3) ) # d_ij - self.lin_e2 = Linear(num_filters, num_filters) # mlp of concat elif self.edge_embed_type == "sh": self.lin_e1 = Linear(15, num_filters) - self.lin_e2 = Linear(num_filters, num_filters) elif self.edge_embed_type == "all": self.lin_e1 = Linear(18, num_filters) - self.lin_e2 = Linear(num_filters, num_filters) else: raise ValueError("edge_embedding_type does not exist") + if self.second_layer_MLP: + self.lin_e2 = Linear(num_filters, num_filters) + self.reset_parameters() def reset_parameters(self): @@ -144,13 +143,13 @@ def reset_parameters(self): self.group_embedding.reset_parameters() nn.init.xavier_uniform_(self.lin.weight) self.lin.bias.data.fill_(0) + nn.init.xavier_uniform_(self.lin_e1.weight) + self.lin_e1.bias.data.fill_(0) if self.second_layer_MLP: nn.init.xavier_uniform_(self.lin_2.weight) self.lin_2.bias.data.fill_(0) - nn.init.xavier_uniform_(self.lin_e1.weight) - self.lin_e1.bias.data.fill_(0) - nn.init.xavier_uniform_(self.lin_e2.weight) - self.lin_e2.bias.data.fill_(0) + nn.init.xavier_uniform_(self.lin_e2.weight) + self.lin_e2.bias.data.fill_(0) if self.edge_embed_type == "all_rij": nn.init.xavier_uniform_(self.lin_e12.weight) self.lin_e12.bias.data.fill_(0) From 7a4f77323eb86a869f953eb24d3bba054f911c09 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 6 Jan 2023 11:51:12 -0500 Subject: [PATCH 021/273] add LinearWarmupCosineAnnealingLR --- ocpmodels/modules/scheduler.py | 38 ++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 223ec24447..cf8cae1b64 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -3,6 +3,7 @@ import torch.optim.lr_scheduler as lr_scheduler from ocpmodels.common.utils import warmup_lr_lambda +import pytorch_warmup as warmup class 
LRScheduler: @@ -20,20 +21,31 @@ class LRScheduler: optimizer (obj): torch optim object """ - def __init__(self, optimizer, config): + def __init__(self, optimizer, optim_config): self.optimizer = optimizer - self.config = config.copy() - if "scheduler" in self.config: - self.scheduler_type = self.config["scheduler"] + self.optim_config = optim_config.copy() + self.warmup_scheduler = None + if "scheduler" in self.optim_config: + self.scheduler_type = self.optim_config["scheduler"] else: self.scheduler_type = "LambdaLR" - scheduler_lambda_fn = lambda x: warmup_lr_lambda(x, self.config) - self.config["lr_lambda"] = scheduler_lambda_fn + scheduler_lambda_fn = lambda x: warmup_lr_lambda(x, self.optim_config) + self.optim_config["lr_lambda"] = scheduler_lambda_fn - if self.scheduler_type != "Null": + if ( + self.scheduler_type != "Null" + and self.scheduler_type != "LinearWarmupCosineAnnealingLR" + ): self.scheduler = getattr(lr_scheduler, self.scheduler_type) - scheduler_args = self.filter_kwargs(config) + scheduler_args = self.filter_kwargs(optim_config) self.scheduler = self.scheduler(optimizer, **scheduler_args) + elif self.scheduler_type == "WarmupCosineAnnealingLR": + self.warmup_scheduler = warmup.ExponentialWarmup( + self.optimizer, warmup_period=optim_config["warmup_steps"] + ) + self.scheduler = lr_scheduler.CosineAnnealingLR( + self.optimizer, T_max=optim_config["max_steps"], eta_min=1e-7 + ) def step(self, metrics=None, epoch=None): if self.scheduler_type == "Null": @@ -43,9 +55,13 @@ def step(self, metrics=None, epoch=None): raise Exception("Validation set required for ReduceLROnPlateau.") self.scheduler.step(metrics) else: - self.scheduler.step() + if self.warmup_scheduler: + with self.warmup_scheduler.dampening(): + self.scheduler.step(epoch) + else: + self.scheduler.step() - def filter_kwargs(self, config): + def filter_kwargs(self, optim_config): # adapted from https://stackoverflow.com/questions/26515595/ sig = inspect.signature(self.scheduler) filter_keys = [ @@ -55,7 +71,7 @@ def filter_kwargs(self, config): ] filter_keys.remove("optimizer") scheduler_args = { - arg: self.config[arg] for arg in self.config if arg in filter_keys + arg: optim_config[arg] for arg in optim_config if arg in filter_keys } return scheduler_args From 261cc5b3ee6914b5138cce903026d99d40aca4cf Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 10:37:59 -0500 Subject: [PATCH 022/273] update configs --- configs/exps/qm7x/fanet.yaml | 24 ++++++++++++------------ configs/exps/qm7x/schnet.yaml | 15 ++++----------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/configs/exps/qm7x/fanet.yaml b/configs/exps/qm7x/fanet.yaml index 627caca163..801c506565 100644 --- a/configs/exps/qm7x/fanet.yaml +++ b/configs/exps/qm7x/fanet.yaml @@ -12,23 +12,23 @@ default: wandb_project: ocp-qm mode: train test_ri: true - wandb_tags: qm7x, std/10 + wandb_tags: qm7x #, std/10 frame_averaging: "" cp_data_to_tmpdir: true note: task: name model: name, num_gaussians, hidden_channels, num_filters, num_interactions, mp_type, edge_embed_type optim: batch_size, lr_initial - dataset: - train: - std_divider: 10.0 - val_id: - std_divider: 10.0 - val_ood: - std_divider: 10.0 + # dataset: + # train: + # std_divider: 10.0 + # val_id: + # std_divider: 10.0 + # val_ood: + # std_divider: 10.0 optim: - batch_size: 1024 - warmup_steps: 3000 + batch_size: 2048 + warmup_steps: 1000 lr_initial: 0.0005 # parameters EMA ema_decay: 0.999 @@ -45,8 +45,8 @@ default: # archi hidden_channels: 256 num_filters: 256 - 
num_gaussians: 32 - num_interactions: 4 + num_gaussians: 100 + num_interactions: 6 cutoff: 6.0 regress_forces: False # fanet diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index 65d4d6654f..73e5ace742 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -12,7 +12,7 @@ default: wandb_project: ocp-qm mode: train test_ri: true - wandb_tags: qm7x, std/10 + wandb_tags: qm7x phys_hidden_channels: 0 phys_embeds: False energy_head: False @@ -24,22 +24,15 @@ default: task: name model: name, num_gaussians, hidden_channels, num_filters, num_interactions optim: batch_size, lr_initial - dataset: - train: - std_divider: 10.0 - val_id: - std_divider: 10.0 - val_ood: - std_divider: 10.0 optim: - batch_size: 1024 - warmup_steps: 3000 + batch_size: 2048 + warmup_steps: 1000 lr_initial: 0.0005 # parameters EMA ema_decay: 0.999 decay_steps: 750000 decay_rate: 0.05 - max_steps: 1000000 + max_steps: 200000 model: hidden_channels: 256 num_filters: 256 From 9206f33b233b54a1027071ede44ddb96e9a77b60 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 9 Jan 2023 09:32:34 -0500 Subject: [PATCH 023/273] Force MAE + many config files --- configs/exps/gnn/test-gnn-all-1.yaml | 82 ++++++++++++++++++- configs/exps/icml/baseline_s2ef.yaml | 17 ++-- configs/exps/icml/test_params.yaml | 30 +++++++ .../exps/prop-check/symmetries_s2ef_2.yaml | 77 +++++++++++++++++ configs/models/dpp.yaml | 34 +++++--- configs/models/schnet.yaml | 8 +- ocpmodels/trainers/base_trainer.py | 24 ++++-- ocpmodels/trainers/single_trainer.py | 23 +++++- 8 files changed, 261 insertions(+), 34 deletions(-) create mode 100644 configs/exps/icml/test_params.yaml create mode 100644 configs/exps/prop-check/symmetries_s2ef_2.yaml diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml index 65fd1a50c8..ea5dd8ec56 100644 --- a/configs/exps/gnn/test-gnn-all-1.yaml +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -4,6 +4,7 @@ job: gres: gpu:rtx8000:4 partition: long time: 20:00:00 + code_loc: /home/mila/a/alexandre.duval/ocp/ocp-test/ocp default: test_ri: True @@ -15,6 +16,8 @@ default: pg_hidden_channels: 0 # shall have been 32 energy_head: 'weighted-av-initial-embeds' # False ? 
wandb_tags: 'test-fanet' + optim: + lr_initial: 0.0008 runs: - config: fanet-is2re-all @@ -22,10 +25,83 @@ runs: model: mp_type: simple edge_embed_type: rij - optim: - lr_initial: 0.0007 + frame_averaging: 2D + fa_fames: random + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: rij + frame_averaging: 2D + fa_fames: random + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all_rij + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: sh - config: fanet-is2re-all note: 'Simple rij baseline' model: - mp_type: simple + mp_type: updownscale + edge_embed_type: rij + - config: fanet-is2re-all + note: 'Simple rij baseline' + model: + mp_type: updownscale edge_embed_type: rij + - config: fanet-is2re-all + note: 'Simple rij baseline' + model: + mp_type: updownscale + edge_embed_type: all + optim: + lr_initial: 0.0007 + max_epochs: 25 + + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all + hidden_channels: 500 + num_interactions: 6 + num_filters: 200 + num_gaussians: 200 + frame_averaging: 2D + fa_fames: random + optim: + lr_initial: 0.0005 + max_epochs: 25 + + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all + hidden_channels: 500 + num_interactions: 6 + num_filters: 200 + num_gaussians: 200 + frame_averaging: 2D + fa_fames: random + optim: + lr_initial: 0.0002 + max_epochs: 25 + + - config: fanet-is2re-all + model: + mp_type: base + edge_embed_type: all + hidden_channels: 500 + num_interactions: 6 + num_filters: 200 + num_gaussians: 200 + frame_averaging: 2D + fa_fames: random + optim: + lr_initial: 0.0007 + max_epochs: 25 diff --git a/configs/exps/icml/baseline_s2ef.yaml b/configs/exps/icml/baseline_s2ef.yaml index fa66d3a6d5..71312e50b3 100644 --- a/configs/exps/icml/baseline_s2ef.yaml +++ b/configs/exps/icml/baseline_s2ef.yaml @@ -1,18 +1,25 @@ job: - mem: 32GB + mem: 48GB cpus: 4 - gres: gpu:rtx8000:4 + gres: gpu:rtx8000:1 partition: long - time: 36:00:00 + time: 42:00:00 default: test_ri: True mode: train - wandb_tags: 'Baseline 2 gpus' + wandb_tags: 'Baseline dpp 1 Gpus' runs: + - config: dpp-s2ef-2M + note: 'Baseline Schnet S2EF' + optim: + batch_size: 368 + eval_batch_size: 368 - config: schnet-s2ef-2M note: 'Baseline Schnet S2EF' optim: max_epochs: 15 - force_coefficient: 50 \ No newline at end of file + force_coefficient: 50 + batch_size: 192 + eval_batch_size: 192 diff --git a/configs/exps/icml/test_params.yaml b/configs/exps/icml/test_params.yaml new file mode 100644 index 0000000000..85a48351e5 --- /dev/null +++ b/configs/exps/icml/test_params.yaml @@ -0,0 +1,30 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: False # False ? 
+ optim: + max_epochs: 10 + wandb_tags: 'prop-check-ICLM' + +runs: + - config: sfarinet-s2ef-2M + note: 'All No TMP 1 GPU with grad target' + model: + regress_forces: direct + optim: + batch_size: 192 + eval_batch_size: 192 + frame_averaging: 2D + fa_frames: all diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml new file mode 100644 index 0000000000..9abfc02b40 --- /dev/null +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -0,0 +1,77 @@ +job: + mem: 48GB + cpus: 4 + gres: gpu:rtx8000:4 + partition: long + time: 40:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: False # False ? + optim: + max_epochs: 5 + wandb_tags: 'prop-check-ICLM' + +runs: + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: from_energy + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs' + frame_averaging: 3D + fa_frames: all + model: + regress_forces: from_energy + + - config: sfarinet-s2ef-2M + note: '2D all gradient' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + - config: sfarinet-s2ef-2M + note: '2d all no gradient' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct + + - config: sfarinet-s2ef-2M + note: 'Big energy grad coef' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 1 + - config: sfarinet-s2ef-2M + note: 'Big energy grad coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 1 + - config: sfarinet-s2ef-2M + note: 'No energy coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 0 diff --git a/configs/models/dpp.yaml b/configs/models/dpp.yaml index 75b50b8f9d..4b973595f4 100644 --- a/configs/models/dpp.yaml +++ b/configs/models/dpp.yaml @@ -87,31 +87,30 @@ s2ef: optim: num_workers: 8 eval_every: 10000 + 200k: optim: # *** Important note *** - # The total number of gpus used for this run was 16. + # The total number of gpus used for this run was 4. # If the global batch size (num_gpus * batch_size) is modified # the lr_milestones and warmup_steps need to be adjusted accordingly. - batch_size: 12 - eval_batch_size: 12 + batch_size: 48 + eval_batch_size: 48 lr_initial: 0.00001 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - 5208 - 8333 - 10416 warmup_steps: 3125 - max_epochs: 30 + max_epochs: 10 force_coefficient: 50 2M: optim: - # *** Important note *** - # The total number of gpus used for this run was 32. - # If the global batch size (num_gpus * batch_size) is modified - # the lr_milestones and warmup_steps need to be adjusted accordingly. 
- batch_size: 12 - eval_batch_size: 12 + batch_size: 96 + eval_batch_size: 96 + eval_every: 10000 + num_workers: 8 lr_initial: 0.0001 lr_gamma: 0.1 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma @@ -119,8 +118,21 @@ s2ef: - 31250 - 41666 warmup_steps: 10416 - max_epochs: 15 + warmup_factor: 0.2 + max_epochs: 5 force_coefficient: 50 + model: + hidden_channels: 192 + out_emb_channels: 192 + num_blocks: 3 + cutoff: 6.0 + num_radial: 6 + num_spherical: 7 + num_before_skip: 1 + num_after_skip: 2 + num_output_layers: 3 + regress_forces: True + use_pbc: True 20M: optim: diff --git a/configs/models/schnet.yaml b/configs/models/schnet.yaml index 217d052cae..48b7fcc544 100644 --- a/configs/models/schnet.yaml +++ b/configs/models/schnet.yaml @@ -82,11 +82,11 @@ s2ef: num_gaussians: 200 optim: # *** Important note *** - # The total number of gpus used for this run was 8. + # The total number of gpus used for this run was 4. # If the global batch size (num_gpus * batch_size) is modified # the lr_milestones and warmup_steps need to be adjusted accordingly. - batch_size: 24 - eval_batch_size: 24 + batch_size: 48 + eval_batch_size: 48 num_workers: 16 lr_initial: 0.0001 lr_gamma: 0.1 @@ -95,7 +95,7 @@ s2ef: - 83333 - 104166 warmup_steps: 31250 - max_epochs: 30 + max_epochs: 20 force_coefficient: 100 200k: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 28b24a80df..f2f69b9bb2 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -20,12 +20,12 @@ import torch.nn as nn import torch.optim as optim import yaml +from rich.console import Console +from rich.table import Table from torch.nn.parallel.distributed import DistributedDataParallel from torch.utils.data import DataLoader from torch_geometric.data import Batch from tqdm import tqdm -from rich.table import Table -from rich.console import Console from ocpmodels.common import distutils from ocpmodels.common.data_parallel import ( @@ -35,7 +35,8 @@ ) from ocpmodels.common.graph_transforms import RandomReflect, RandomRotate from ocpmodels.common.registry import registry -from ocpmodels.common.utils import get_commit_hash, save_checkpoint, JOB_ID +from ocpmodels.common.timer import Times +from ocpmodels.common.utils import JOB_ID, get_commit_hash, save_checkpoint from ocpmodels.datasets.data_transforms import FrameAveraging, get_transforms from ocpmodels.modules.evaluator import Evaluator from ocpmodels.modules.exponential_moving_average import ( @@ -44,7 +45,6 @@ from ocpmodels.modules.loss import DDPLoss, L2MAELoss from ocpmodels.modules.normalizer import Normalizer from ocpmodels.modules.scheduler import LRScheduler -from ocpmodels.common.timer import Times @registry.register_trainer("base") @@ -725,7 +725,8 @@ def eval_all_splits( """Evaluate model on all four validation splits""" cumulated_time = 0 - cumulated_mae = 0 + cumulated_energy_mae = 0 + cumulated_forces_mae = 0 metrics_dict = {} # store all non-train splits: all vals and test all_splits = [s for s in self.config["dataset"] if s.startswith("val")] @@ -759,7 +760,9 @@ def eval_all_splits( return "SIGTERM" metrics_dict[split] = self.metrics - cumulated_mae += self.metrics["energy_mae"]["metric"] + cumulated_energy_mae += self.metrics["energy_mae"]["metric"] + if self.config["model"].get("regress_forces", False): + cumulated_forces_mae += self.metrics["forces_mae"]["metric"] cumulated_time += time.time() - start_time if metrics_names is None: metrics_names = list(self.metrics.keys()) @@ -777,12 
+780,15 @@ def eval_all_splits( # Log specific metrics if final and self.config["logger"] == "wandb" and distutils.is_master(): - overall_mae = cumulated_mae / len(all_splits) + overall_energy_mae = cumulated_energy_mae / len(all_splits) self.logger.log({"Eval time": cumulated_time}) - self.logger.log({"Overall MAE": overall_mae}) + self.logger.log({"Overall MAE": overall_energy_mae}) + if self.config["model"].get("regress_forces", False): + overall_forces_mae = cumulated_forces_mae / len(all_splits) + self.logger.log({"Overall Forces MAE": overall_forces_mae}) if self.logger.ntfy: self.logger.ntfy( - message=f"{JOB_ID} - Overall MAE: {overall_mae}", + message=f"{JOB_ID} - Overall MAE: {overall_energy_mae}", click=self.logger.url, ) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 388afa127c..3e8529f67e 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -202,6 +202,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): start_epoch = self.step // n_train loader_times = Times() epoch_times = [] + model_run_time = 0 if not self.silent: print("---Beginning of Training---") @@ -231,6 +232,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): batch = next(train_loader_iter) # Forward, loss, backward. + if epoch_int == 1: + s = time.time() + with torch.cuda.amp.autocast(enabled=self.scaler is not None): preds = self.model_forward(batch) loss = self.compute_loss(preds, batch) @@ -238,6 +242,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): coeff = self.config["optim"].get("pooling_coefficient", 1) loss["total_loss"] += preds["pooling_loss"] * coeff + if epoch_int == 1: + model_run_time += time.time() - s + loss = { k: self.scaler.scale(v) if self.scaler else v for k, v in loss.items() @@ -350,6 +357,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): log_epoch_times = True self.model_forward(batch) self.logger.log({"Batch time": time.time() - start_time}) + self.logger.log({"Model run time": model_run_time / len(self.train_loader)}) if log_epoch_times: self.logger.log({"Epoch time": sum(epoch_times) / len(epoch_times)}) @@ -666,8 +674,19 @@ def test_model_symmetries(self, debug_batches=-1): reflected = self.reflect_graph(batch) preds3 = self.model_forward(reflected["batch_list"]) energy_diff_refl += torch.abs(preds1["energy"] - preds3["energy"]).sum() - if self.task_name == "s2ef": - forces_diff_refl += torch.abs(preds1["forces"] - preds3["forces"]).sum() + if self.task_name == "s2ef": + forces_diff_refl += torch.abs( + preds1["forces"] @ reflected["rot"].to(preds1["forces"].device) + - preds3["forces"] + ).sum() + # assert torch.allclose( + # torch.abs( + # batch[0].force @ reflected["rot"].to(batch[0].force.device) + # - reflected["batch_list"][0].force # .to(batch[0].force.device) + # ).sum(), + # torch.tensor([0.0]), # .to(batch[0].force.device) + # atol=1e-05, + # ) # 3D Rotation and compute diff in prediction rotated = self.rotate_graph(batch) From 8d4947d31c15c2ac094ca416cf794b304709271b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:23:25 -0500 Subject: [PATCH 024/273] various bug fixes from `orion` branch --- launch_exp.py | 4 +-- ocpmodels/common/utils.py | 51 +++++++++++++++--------------- ocpmodels/datasets/lmdb_dataset.py | 11 ++++--- ocpmodels/modules/scheduler.py | 13 +++++--- 4 files changed, 44 insertions(+), 35 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 16f5932b2b..c2a16c76f5 100644 --- a/launch_exp.py 
+++ b/launch_exp.py @@ -29,7 +29,7 @@ def merge_dicts(dict1: dict, dict2: dict): Returns ------- - return_dict_and_duplicates: tuple(dict, list(str)) + return_dict: dict Merged dictionaries. """ if not isinstance(dict1, dict): @@ -51,7 +51,7 @@ def merge_dicts(dict1: dict, dict2: dict): f"List for key {k} has different length in dict1 and dict2." + " Use an empty dict {} to pad for items in the shorter list." ) - return_dict[k] = [merge_dicts(d1, d2)[0] for d1, d2 in zip(dict1[k], v)] + return_dict[k] = [merge_dicts(d1, d2) for d1, d2 in zip(dict1[k], v)] else: return_dict[k] = dict2[k] diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 1938cf98b8..fe2a524c46 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1,4 +1,4 @@ -""" +"""utils.py Copyright (c) Facebook, Inc. and its affiliates. This source code is licensed under the MIT license found in the @@ -50,26 +50,33 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): ): return trainer_config + print("\nMoving data to slurm tmpdir", flush=True) + tmp_dir = Path(f"/Tmp/slurm.{JOB_ID}.0") for s, split in trainer_config["dataset"].items(): if not isinstance(split, dict): continue - new_dir = tmp_dir / Path(split["src"]).name + original = Path(split["src"]) + if original.is_file(): + original = original.parent + new_dir = tmp_dir / original.name if new_dir.exists(): print( f"Data already copied to {str(new_dir)} for split", f"{s} with source path {split['src']}", + flush=True, ) trainer_config["dataset"][s]["src"] = str(new_dir) continue + print("Making new_dir: ", str(new_dir), flush=True) new_dir.mkdir() - command = ["rsync", "-av", f'{split["src"]}/', str(new_dir)] - print("Copying data: ", " ".join(command)) + command = ["cp", "-r", f"{str(original)}", str(new_dir.parent)] + print("Copying data: ", " ".join(command), flush=True) subprocess.run(command) for f in new_dir.glob("*.lmdb-lock"): f.unlink() trainer_config["dataset"][s]["src"] = str(new_dir) - print("Done moving data to", str(new_dir)) + print("Done moving data to", str(new_dir), flush=True) return trainer_config @@ -98,7 +105,7 @@ def override_narval_paths(trainer_config): "with", path_overrides[task][split], ) - trainer_config["dataset"], _ = merge_dicts( + trainer_config["dataset"] = merge_dicts( trainer_config["dataset"], path_overrides[task][split] ) @@ -702,11 +709,11 @@ def load_config(config_str): assert "default" in task_conf assert split in task_conf - config, _ = merge_dicts({}, model_conf["default"]) - config, _ = merge_dicts(config, model_conf[task].get("default", {})) - config, _ = merge_dicts(config, model_conf[task][split]) - config, _ = merge_dicts(config, task_conf["default"]) - config, _ = merge_dicts(config, task_conf[split]) + config = merge_dicts({}, model_conf["default"]) + config = merge_dicts(config, model_conf[task].get("default", {})) + config = merge_dicts(config, model_conf[task][split]) + config = merge_dicts(config, task_conf["default"]) + config = merge_dicts(config, task_conf[split]) config["task"]["name"] = task config["task"]["split"] = split @@ -725,11 +732,9 @@ def build_config(args, args_override): # Check for overridden parameters. 
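For orientation, the chain of `merge_dicts` calls in `load_config` above, followed by the override handling here, is a plain "later layers win" composition. A toy illustration with made-up dictionaries, not the real YAML contents:

from functools import reduce

def merge(d1: dict, d2: dict) -> dict:
    # Same idea as merge_dicts, minus list handling: nested dicts merge, later values win.
    out = dict(d1)
    for k, v in d2.items():
        if isinstance(v, dict) and isinstance(out.get(k), dict):
            out[k] = merge(out[k], v)
        else:
            out[k] = v
    return out

layers = [
    {"optim": {"lr_initial": 0.001, "max_epochs": 20}},  # model default
    {"optim": {"lr_initial": 0.0005}},                   # model default for this task
    {"optim": {"batch_size": 256}},                      # model config for this split
    {"optim": {"max_epochs": 10}},                       # command-line override
]
print(reduce(merge, layers, {}))
# {'optim': {'lr_initial': 0.0005, 'max_epochs': 10, 'batch_size': 256}}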
if args_override != []: overrides = create_dict_from_args(args_override) - config, _ = merge_dicts(config, overrides) + config = merge_dicts(config, overrides) - config, _ = merge_dicts( - config, {k: v for k, v in vars(args).items() if v is not None} - ) + config = merge_dicts(config, {k: v for k, v in vars(args).items() if v is not None}) config["data_split"] = args.config.split("-")[-1] config["run_dir"] = resolve(config["run_dir"]) config["slurm"] = {} @@ -760,7 +765,6 @@ def build_config(args, args_override): config = set_qm7x_target_stats(config) config = override_narval_paths(config) config = auto_note(config) - config = move_lmdb_data_to_slurm_tmpdir(config) if not config["no_cpus_to_workers"]: cpus = count_cpus() @@ -1105,7 +1109,7 @@ def get_pruned_edge_idx(edge_index, num_atoms=None, max_neigh=1e9): return _nonmax_idx -def merge_dicts(dict1: dict, dict2: dict): +def merge_dicts(dict1: dict, dict2: dict) -> dict: """Recursively merge two dictionaries. Values in dict2 override values in dict1. If dict1 and dict2 contain a dictionary as a value, this will call itself recursively to merge these dictionaries. @@ -1123,7 +1127,7 @@ def merge_dicts(dict1: dict, dict2: dict): Returns ------- - return_dict_and_duplicates: tuple(dict, list(str)) + return_dict: dict Merged dictionaries. """ if not isinstance(dict1, dict): @@ -1132,27 +1136,24 @@ def merge_dicts(dict1: dict, dict2: dict): raise ValueError(f"Expecting dict2 to be dict, found {type(dict2)}.") return_dict = copy.deepcopy(dict1) - duplicates = [] for k, v in dict2.items(): if k not in dict1: return_dict[k] = v else: if isinstance(v, dict) and isinstance(dict1[k], dict): - return_dict[k], duplicates_k = merge_dicts(dict1[k], dict2[k]) - duplicates += [f"{k}.{dup}" for dup in duplicates_k] + return_dict[k] = merge_dicts(dict1[k], dict2[k]) elif isinstance(v, list) and isinstance(dict1[k], list): if len(dict1[k]) != len(dict2[k]): raise ValueError( f"List for key {k} has different length in dict1 and dict2." + " Use an empty dict {} to pad for items in the shorter list." ) - return_dict[k] = [merge_dicts(d1, d2)[0] for d1, d2 in zip(dict1[k], v)] + return_dict[k] = [merge_dicts(d1, d2) for d1, d2 in zip(dict1[k], v)] else: return_dict[k] = dict2[k] - duplicates.append(k) - return return_dict, duplicates + return return_dict class SeverityLevelBetween(logging.Filter): @@ -1315,4 +1316,4 @@ def base_config(config, overrides={}): ], ) - return merge_dicts(conf, overrides)[0] + return merge_dicts(conf, overrides) diff --git a/ocpmodels/datasets/lmdb_dataset.py b/ocpmodels/datasets/lmdb_dataset.py index 0540741c70..2eaef01200 100644 --- a/ocpmodels/datasets/lmdb_dataset.py +++ b/ocpmodels/datasets/lmdb_dataset.py @@ -1,4 +1,4 @@ -""" +"""lmdb_dataset.py Copyright (c) Facebook, Inc. and its affiliates. 
This source code is licensed under the MIT license found in the @@ -52,9 +52,12 @@ def __init__(self, config, transform=None, fa_frames=None): self._keys, self.envs = [], [] for db_path in db_paths: self.envs.append(self.connect_db(db_path)) - length = pickle.loads( - self.envs[-1].begin().get("length".encode("ascii")) - ) + length = self.envs[-1].begin().get("length".encode("ascii")) + if length is not None: + length = pickle.loads(length) + else: + length = self.envs[-1].stat()["entries"] + assert length is not None, f"Could not find length of LMDB {db_path}" self._keys.append(list(range(length))) keylens = [len(k) for k in self._keys] diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index cf8cae1b64..dbd4106142 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -1,3 +1,5 @@ +"""scheduler.py +""" import inspect import torch.optim.lr_scheduler as lr_scheduler @@ -29,7 +31,10 @@ def __init__(self, optimizer, optim_config): self.scheduler_type = self.optim_config["scheduler"] else: self.scheduler_type = "LambdaLR" - scheduler_lambda_fn = lambda x: warmup_lr_lambda(x, self.optim_config) + + def scheduler_lambda_fn(x): + return warmup_lr_lambda(x, self.optim_config) + self.optim_config["lr_lambda"] = scheduler_lambda_fn if ( @@ -37,14 +42,14 @@ def __init__(self, optimizer, optim_config): and self.scheduler_type != "LinearWarmupCosineAnnealingLR" ): self.scheduler = getattr(lr_scheduler, self.scheduler_type) - scheduler_args = self.filter_kwargs(optim_config) + scheduler_args = self.filter_kwargs(self.optim_config) self.scheduler = self.scheduler(optimizer, **scheduler_args) elif self.scheduler_type == "WarmupCosineAnnealingLR": self.warmup_scheduler = warmup.ExponentialWarmup( - self.optimizer, warmup_period=optim_config["warmup_steps"] + self.optimizer, warmup_period=self.optim_config["warmup_steps"] ) self.scheduler = lr_scheduler.CosineAnnealingLR( - self.optimizer, T_max=optim_config["max_steps"], eta_min=1e-7 + self.optimizer, T_max=self.optim_config["max_steps"], eta_min=1e-7 ) def step(self, metrics=None, epoch=None): From da4310adb5c7ad8701d7be2fc0905423016b6bf6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 15:05:27 -0500 Subject: [PATCH 025/273] print wandb query --- launch_exp.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index c2a16c76f5..9196bfb883 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -11,6 +11,15 @@ import copy +def util_strings(jobs, yaml_comments=False): + s = "All jobs launched: " + ", ".join(jobs) + s += "\nCancel experiment: scancel " + " ".join(jobs) + s += "\nWandB query for dashboard: (" + "|".join(jobs) + ")" + if yaml_comments: + s = "\n".join(["# " + line for line in s.splitlines()]) + return s + + def merge_dicts(dict1: dict, dict2: dict): """Recursively merge two dictionaries. Values in dict2 override values in dict1. 
If dict1 and dict2 contain a dictionary @@ -75,6 +84,7 @@ def write_exp_yaml_and_jobs(exp_file, outfile, jobs): if line.strip().startswith("- "): lines[run_line + i] = f"{line} # {jobs[j]}" j += 1 + lines += [""] + util_strings(jobs, True).splitlines() yml_out = outfile.with_suffix(".yaml") yml_out.write_text("\n".join(lines)) return yml_out @@ -194,7 +204,7 @@ def cli_arg(args, key=""): with outfile.open("w") as f: f.write(text) print(f"Output written to {str(outfile)}") - print("All job launched:", " ".join(jobs)) + print(util_strings(jobs)) yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) print( "Experiment summary YAML in ", f"./{str(yml_out.relative_to(Path.cwd()))}" From c44ccf8a1a6c070eec824e39c30d780e15c2cb90 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 9 Jan 2023 18:22:11 -0500 Subject: [PATCH 026/273] update config files --- configs/exps/gnn/is2re_1gpu.yaml | 118 ++++++++++++++++++ configs/exps/gnn/test-gnn-all-1.yaml | 6 +- configs/exps/icml/baseline_s2ef.yaml | 22 ++-- .../exps/prop-check/symmetries_s2ef_2.yaml | 21 +++- configs/models/dpp.yaml | 2 +- configs/models/fanet.yaml | 23 +++- configs/models/sfarinet.yaml | 5 +- ocpmodels/models/sfarinet.py | 1 + ocpmodels/trainers/single_trainer.py | 4 +- scripts/gnn_dev.py | 6 +- 10 files changed, 177 insertions(+), 31 deletions(-) create mode 100644 configs/exps/gnn/is2re_1gpu.yaml diff --git a/configs/exps/gnn/is2re_1gpu.yaml b/configs/exps/gnn/is2re_1gpu.yaml new file mode 100644 index 0000000000..5aa2ecb141 --- /dev/null +++ b/configs/exps/gnn/is2re_1gpu.yaml @@ -0,0 +1,118 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 30:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? 
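As a concrete example of the summary produced by the `util_strings` helper added above (hypothetical job ids, and assuming the call runs from the repository root so `launch_exp` is importable):

from launch_exp import util_strings

print(util_strings(["1234567", "1234568"]))
# All jobs launched: 1234567, 1234568
# Cancel experiment: scancel 1234567 1234568
# WandB query for dashboard: (1234567|1234568)

# With yaml_comments=True each line is prefixed with "# " so it can be appended
# to the experiment YAML copy written by write_exp_yaml_and_jobs.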
+ wandb_tags: 'is2re-archi-tests' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + +runs: + - config: schnet-is2re-all + note: 'Schnet' + - config: sfarinet-is2re-all + note: 'Sfarinet test' + frame_averaging: 2D + fa_fames: se3-random + - config: sfarinet-is2re-all + note: 'Smaller lr' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.0005 + - config: sfarinet-is2re-all + note: 'Sfarinet test smaller lr' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.003 + - config: sfarinet-is2re-all + note: 'Bigger size' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.001 + model: + hidden_channels: 500 + num_interactions: 4 + num_filters: 200 + num_gaussians: 200 + - config: sfarinet-is2re-all + note: 'Bigger size' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.0007 + model: + hidden_channels: 500 + num_interactions: 4 + num_filters: 200 + num_gaussians: 200 + - config: sfarinet-is2re-all + note: 'Bigger size' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.001 + model: + num_interactions: 6 + - config: sfarinet-is2re-all + note: 'Bigger size and smaller lr' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.0007 + model: + num_interactions: 6 + - config: sfarinet-is2re-all + note: 'Bigger size and change warmup steps' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.001 + lr_milestones: + - 20981 + - 26972 + - 35963 + warmup_steps: 10094 + model: + hidden_channels: 500 + num_interactions: 4 + num_filters: 200 + num_gaussians: 200 + - config: sfarinet-is2re-all + note: 'Much Bigger size' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.0007 + model: + hidden_channels: 800 + num_interactions: 4 + num_filters: 284 + num_gaussians: 284 + - config: sfarinet-is2re-all + note: 'Smaller size more interactions' + frame_averaging: 2D + fa_fames: se3-random + optim: + lr_initial: 0.001 + model: + hidden_channels: 128 + num_interactions: 6 + num_filters: 100 + num_gaussians: 100 + diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml index ea5dd8ec56..00a54ada07 100644 --- a/configs/exps/gnn/test-gnn-all-1.yaml +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -17,7 +17,7 @@ default: energy_head: 'weighted-av-initial-embeds' # False ? 
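As a reminder of what the `lr_initial` / `warmup_steps` / `lr_milestones` / `lr_gamma` combinations in the sweeps above mean: the learning rate warms up linearly over `warmup_steps`, then is multiplied by `lr_gamma` every time a milestone (an optimizer-step count, not an epoch) is passed. A self-contained sketch of that multiplier, mirroring the usual `warmup_lr_lambda` behaviour; the exact implementation lives in `ocpmodels/common/utils.py`:

from bisect import bisect

def lr_multiplier(step, warmup_steps, warmup_factor, milestones, gamma):
    # Linear warmup from warmup_factor to 1, then step decay by gamma at each milestone.
    if step <= warmup_steps:
        alpha = step / float(warmup_steps)
        return warmup_factor * (1.0 - alpha) + alpha
    return gamma ** bisect(milestones, step)

# e.g. lr_initial=0.001, warmup_steps=10094, lr_milestones=[20981, 26972, 35963], lr_gamma=0.1
# (warmup_factor assumed to be 0.2 for this example)
for step in (5000, 15000, 25000, 40000):
    print(step, 0.001 * lr_multiplier(step, 10094, 0.2, [20981, 26972, 35963], 0.1))
# ~0.0006 at 5000, 0.001 at 15000, 0.0001 at 25000, 1e-06 at 40000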
wandb_tags: 'test-fanet' optim: - lr_initial: 0.0008 + lr_initial: 0.0005 runs: - config: fanet-is2re-all @@ -61,7 +61,7 @@ runs: mp_type: updownscale edge_embed_type: all optim: - lr_initial: 0.0007 + lr_initial: 0.0004 max_epochs: 25 - config: fanet-is2re-all @@ -75,7 +75,7 @@ runs: frame_averaging: 2D fa_fames: random optim: - lr_initial: 0.0005 + lr_initial: 0.0004 max_epochs: 25 - config: fanet-is2re-all diff --git a/configs/exps/icml/baseline_s2ef.yaml b/configs/exps/icml/baseline_s2ef.yaml index 71312e50b3..cefff0fb5e 100644 --- a/configs/exps/icml/baseline_s2ef.yaml +++ b/configs/exps/icml/baseline_s2ef.yaml @@ -1,25 +1,27 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:1 + gres: gpu:rtx8000:2 partition: long time: 42:00:00 default: test_ri: True mode: train - wandb_tags: 'Baseline dpp 1 Gpus' + wandb_tags: 'baseline-schnet' runs: - - config: dpp-s2ef-2M - note: 'Baseline Schnet S2EF' - optim: - batch_size: 368 - eval_batch_size: 368 - config: schnet-s2ef-2M - note: 'Baseline Schnet S2EF' + note: 'Baseline Schnet S2EF 2 GPU' + optim: + max_epochs: 15 + force_coefficient: 50 + batch_size: 96 + eval_batch_size: 96 + - config: schnet-is2re-2M + note: 'Baseline Schnet IS2RE 2 GPU' optim: max_epochs: 15 force_coefficient: 50 - batch_size: 192 - eval_batch_size: 192 + batch_size: 128 + eval_batch_size: 128 diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml index 9abfc02b40..aebe1b7934 100644 --- a/configs/exps/prop-check/symmetries_s2ef_2.yaml +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -1,9 +1,9 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:4 + gres: gpu:rtx8000:1 partition: long - time: 40:00:00 + time: 20:00:00 default: test_ri: True @@ -16,17 +16,19 @@ default: energy_head: False # False ? 
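These prop-check runs measure how far predictions are from the symmetries we want: energies should be invariant to rotations and reflections of the structure, and forces should transform with it. Schematically, the gap being tracked looks like the following sketch (stand-in `model` callable; the real measurement is `test_model_symmetries` in the trainer):

import torch

def symmetry_gaps(model, pos, rot):
    # Energy-invariance and force-equivariance gaps for one orthogonal transform `rot`.
    e1, f1 = model(pos)         # stand-in: returns (energy, forces) from positions
    e2, f2 = model(pos @ rot)   # same structure, rotated or reflected
    energy_gap = (e1 - e2).abs().sum()
    forces_gap = (f1 @ rot - f2).abs().sum()  # forces should rotate like positions
    return energy_gap, forces_gap

# Toy check with a model that is exactly invariant / equivariant by construction:
rot = torch.linalg.qr(torch.randn(3, 3)).Q    # random orthogonal matrix
model = lambda pos: ((pos ** 2).sum(), 2 * pos)
pos = torch.randn(8, 3)
print(symmetry_gaps(model, pos, rot))         # both gaps ~0 up to float error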
optim: max_epochs: 5 - wandb_tags: 'prop-check-ICLM' + batch_size: 196 + eval_batch_size: 196 + wandb_tags: 's2ef-sym-prop' runs: - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs' + note: 'Baseline 5 epochs 1 Gpu' frame_averaging: 2D fa_frames: all model: regress_forces: from_energy - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs' + note: 'Baseline 5 epochs 1 Gpu' frame_averaging: 3D fa_frames: all model: @@ -75,3 +77,12 @@ runs: energy_grad_coefficient: 100 force_coefficient: 30 energy_coefficient: 0 + - config: sfarinet-s2ef-2M + note: 'Large force coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + force_coefficient: 75 + energy_coefficient: 1 \ No newline at end of file diff --git a/configs/models/dpp.yaml b/configs/models/dpp.yaml index 4b973595f4..6a289bbc22 100644 --- a/configs/models/dpp.yaml +++ b/configs/models/dpp.yaml @@ -119,7 +119,7 @@ s2ef: - 41666 warmup_steps: 10416 warmup_factor: 0.2 - max_epochs: 5 + max_epochs: 15 force_coefficient: 50 model: hidden_channels: 192 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 9840d6432f..5bf8ad3546 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -85,6 +85,8 @@ is2re: # ----- S2EF ----- # ------------------ +# For 4 GPUs + s2ef: default: model: @@ -101,17 +103,26 @@ s2ef: warmup_steps: 25000 warmup_factor: 0.2 lr_gamma: 0.1 - lr_initial: 0.0002 - max_epochs: 20 + lr_initial: 0.0001 + max_epochs: 15 warmup_steps: 20000 lr_milestones: - - 50000 - - 70000 - - 90000 + - 55000 + - 75000 + - 10000 200k: {} - 2M: {} + # 2 gpus + 2M: + model: + num_interactions: 5 + hidden_channels: 1024 + num_gaussians: 200 + num_filters: 256 + optim: + batch_size: 96 + eval_batch_size: 96 20M: {} diff --git a/configs/models/sfarinet.yaml b/configs/models/sfarinet.yaml index 0a1b1ed922..57bc1afdec 100644 --- a/configs/models/sfarinet.yaml +++ b/configs/models/sfarinet.yaml @@ -74,7 +74,8 @@ is2re: model: hidden_channels: 384 num_interactions: 4 - + num_filters: 128 + num_gaussians: 100 optim: lr_initial: 0.001 lr_milestones: @@ -88,6 +89,8 @@ is2re: # ----- S2EF ----- # ------------------ +# For 4 GPUs + s2ef: default: model: diff --git a/ocpmodels/models/sfarinet.py b/ocpmodels/models/sfarinet.py index bdbd89cfd3..f55fc414aa 100644 --- a/ocpmodels/models/sfarinet.py +++ b/ocpmodels/models/sfarinet.py @@ -273,6 +273,7 @@ class SfariNet(BaseModel): def __init__(self, **kwargs): super().__init__() + torch.autograd.set_detect_anomaly(True) self.cutoff = kwargs["cutoff"] self.use_pbc = kwargs["use_pbc"] self.max_num_neighbors = kwargs["max_num_neighbors"] diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 3e8529f67e..3e1983d951 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -227,7 +227,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): self.epoch = epoch_int + (i + 1) / n_train self.step = epoch_int * n_train + i + 1 - # Get a batch. 
+ # Get a batch with loader_times.next("get_batch"): batch = next(train_loader_iter) @@ -357,7 +357,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): log_epoch_times = True self.model_forward(batch) self.logger.log({"Batch time": time.time() - start_time}) - self.logger.log({"Model run time": model_run_time / len(self.train_loader)}) + self.logger.log({"Model run time": model_run_time / n_train}) if log_epoch_times: self.logger.log({"Epoch time": sum(epoch_times) / len(epoch_times)}) diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index b92d21578e..d2e114c655 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -21,7 +21,7 @@ config["frame_averaging"] = "2D" config["fa_frames"] = "random" # "random" config["test_ri"] = True - config["optim"] = {"max_epochs": 1} + config["optim"] = {"max_epochs": 0} config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "rij" config["model"]["mp_type"] = "base" @@ -32,8 +32,8 @@ str_args = sys.argv[1:] if all("config" not in arg for arg in str_args): str_args.append("--is_debug") - str_args.append("--config=fanet-is2re-10k") - # str_args.append("--config=sfarinet-s2ef-2M") + # str_args.append("--config=fanet-is2re-10k") + str_args.append("--config=sfarinet-s2ef-2M") warnings.warn( "No model / mode is given; chosen as default" + f"Using: {str_args[-1]}" ) From 49fb8dc6ca37544776fc8dab9190638c08693440 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 23:51:52 -0500 Subject: [PATCH 027/273] early stopping & spooky params --- configs/exps/qm7x/schnet-from-spooky.yaml | 51 ++++++++++++++++ ocpmodels/modules/scheduler.py | 71 ++++++++++++++++++++++- ocpmodels/trainers/base_trainer.py | 3 +- ocpmodels/trainers/single_trainer.py | 29 +++++---- 4 files changed, 142 insertions(+), 12 deletions(-) create mode 100644 configs/exps/qm7x/schnet-from-spooky.yaml diff --git a/configs/exps/qm7x/schnet-from-spooky.yaml b/configs/exps/qm7x/schnet-from-spooky.yaml new file mode 100644 index 0000000000..6f2164c4d6 --- /dev/null +++ b/configs/exps/qm7x/schnet-from-spooky.yaml @@ -0,0 +1,51 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 8 + gres: gpu:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: schnet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions + optim: batch_size, lr_initial + optim: + batch_size: 10 + warmup_steps: 1000 + lr_initial: 0.0001 + # parameters EMA + # ema_decay: 0.999 + decay_steps: 750000 + scheduler: + decay_rate: 0.01 + max_steps: 1000000 + model: + hidden_channels: 128 + num_filters: 128 + num_gaussians: 20 + num_interactions: 6 + cutoff: 5.0 + +runs: + - optim: + ema_decay: 0.999 + - optim: + scheduler: LinearWarmupCosineAnnealingLR + - optim: + ema_decay: 0.999 + scheduler: LinearWarmupCosineAnnealingLR diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index dbd4106142..0d993b3925 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -1,7 +1,6 @@ """scheduler.py """ import inspect - import torch.optim.lr_scheduler as lr_scheduler from ocpmodels.common.utils import warmup_lr_lambda @@ -83,3 +82,73 @@ def filter_kwargs(self, optim_config): def get_lr(self): for 
group in self.optimizer.param_groups: return group["lr"] + + +class EarlyStopper: + """ + Class that stores the current best metric score and monitors whether + it's improving or not. If it does not decrease for a certain number + of validation calls (with some minimal improvement) then it tells the trainer + to stop. + """ + + def __init__( + self, patience=7, mode="min", min_abs_change=1e-5, store_all_steps=True + ): + self.patience = patience + self.mode = mode + self.counter = 0 + self.min_abs_change = min_abs_change + self.store_all_steps = store_all_steps + self.metrics = [] + + if self.mode == "min": + self.best_score = float("inf") + elif self.mode == "max": + self.best_score = float("-inf") + else: + raise ValueError("mode must be either min or max") + + self.early_stop = False + + def should_stop(self, metric): + """ + Returns True if the metric has not improved for a certain number of + steps. False otherwise. Stores the metric in `self.metrics`: all the steps if + `self.store_all_steps` is `True`, otherwise only the last `n=self.patience`. + + Args: + metric (Number): Metric to track. + + Returns: + bool: Wether to stop training or not + """ + metric = float(metric) + self.metrics.append(metric) + if not self.store_all_steps: + self.metrics = self.metrics[-self.patience :] + + if self.mode == "min": + if metric < self.best_score - self.min_abs_change: + self.best_score = metric + self.counter = 0 + else: + self.counter += 1 + elif self.mode == "max": + if metric > self.best_score + self.min_abs_change: + self.best_score = metric + self.counter = 0 + else: + self.counter += 1 + + if self.counter >= self.patience: + self.early_stop = True + + return self.early_stop + + @property + def reason(self): + return ( + f"Early stopping after {self.counter} steps with no improvement:\n" + + " -> ".join([f"{m:.6f}" for m in self.metrics[-self.patience :]]) + ) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index b56c910eb2..4804526f11 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -44,7 +44,7 @@ ) from ocpmodels.modules.loss import DDPLoss, L2MAELoss from ocpmodels.modules.normalizer import Normalizer -from ocpmodels.modules.scheduler import LRScheduler +from ocpmodels.modules.scheduler import LRScheduler, EarlyStopper @registry.register_trainer("base") @@ -79,6 +79,7 @@ def __init__(self, **kwargs): self.datasets = {} self.samplers = {} self.loaders = {} + self.early_stopper = EarlyStopper(patience=10, min_abs_change=1e-5) if torch.cuda.is_available() and not self.cpu: self.device = torch.device(f"cuda:{self.config['local_rank']}") diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 3e1983d951..7dd1b0aec4 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -297,7 +297,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # Evaluate on val set after every `eval_every` iterations. 
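Usage-wise, the stopper above is fed the tracked validation metric once per evaluation, and training halts after `patience` evaluations without at least `min_abs_change` of improvement. A minimal sketch with synthetic numbers, assuming the class ships in `ocpmodels.modules.scheduler` as in this patch:

from ocpmodels.modules.scheduler import EarlyStopper

stopper = EarlyStopper(patience=3, mode="min", min_abs_change=1e-4)
val_maes = [0.60, 0.55, 0.551, 0.5505, 0.552]  # made-up validation energy MAEs
for i, mae in enumerate(val_maes):
    if stopper.should_stop(mae):
        print(f"stopping at eval {i}:", stopper.reason)
        break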
if should_validate: self.save( - checkpoint_file=f"checkpoint-{str(self.step).zfill(6)}.pt", + checkpoint_file=f"checkpoint-{str(self.step).zfill(7)}.pt", training_state=True, ) @@ -307,10 +307,13 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): debug_batches=debug_batches, is_first=first_eval, ) + first_eval = False if val_metrics == "SIGTERM": return "SIGTERM" + current_val_metric = val_metrics[primary_metric]["metric"] + if current_val_metric < self.best_val_metric: self.best_val_metric = current_val_metric self.save( @@ -318,6 +321,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): checkpoint_file="best_checkpoint.pt", training_state=False, ) + if self.early_stopper.should_stop(current_val_metric): + print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") + if self.logger: + self.logger.add_tags(["E-S"]) + return self.end_of_training() + self.model.train() self.scheduler_step(eval_every, current_val_metric) @@ -334,9 +343,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): torch.cuda.empty_cache() # End of training. + if not is_test_env: + return self.end_of_training() - if is_test_env: - return + def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times): eas = self.eval_all_splits(True, epoch=epoch_int, debug_batches=debug_batches) if eas == "SIGTERM": @@ -349,17 +359,16 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # Time model if self.logger is not None: - log_epoch_times = False + log_epoch_times = self.config["optim"]["max_epochs"] > 0 start_time = time.time() - if self.config["optim"]["max_epochs"] == 0: - batch = next(iter(self.loaders["train"])) - else: - log_epoch_times = True + + # deterministic batch because shuffle=False for validation + batch = next(iter(self.loaders[self.config["dataset"]["default_val"]])) self.model_forward(batch) self.logger.log({"Batch time": time.time() - start_time}) self.logger.log({"Model run time": model_run_time / n_train}) if log_epoch_times: - self.logger.log({"Epoch time": sum(epoch_times) / len(epoch_times)}) + self.logger.log({"Epoch time": np.mean(epoch_times)}) # Check respect of symmetries if self.test_ri and not is_test_env: @@ -674,7 +683,7 @@ def test_model_symmetries(self, debug_batches=-1): reflected = self.reflect_graph(batch) preds3 = self.model_forward(reflected["batch_list"]) energy_diff_refl += torch.abs(preds1["energy"] - preds3["energy"]).sum() - if self.task_name == "s2ef": + if self.task_name == "s2ef": forces_diff_refl += torch.abs( preds1["forces"] @ reflected["rot"].to(preds1["forces"].device) - preds3["forces"] From 7798d2a561aeffbe8949c74c307290ca08433398 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 19:06:40 -0500 Subject: [PATCH 028/273] fix line length --- sbatch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sbatch.py b/sbatch.py index 6edb23ca03..bb0ff8ab36 100644 --- a/sbatch.py +++ b/sbatch.py @@ -232,7 +232,9 @@ def add_jobid_to_log(j, command_line, exp_name=None): sbatch_py_vars["num-nodes"] = args.nodes sbatch_py_vars["num-gpus"] = args.ntasks_per_node else: - args.py_args += f" --distributed --num-nodes {args.nodes} --num-gpus {args.ntasks_per_node}" + args.py_args += " --distributed --num-nodes {} --num-gpus {}".format( + args.nodes, args.ntasks_per_node + ) # add logdir to main.py's command-line arguments if "--logdir" not in args.py_args and args.logdir: From 2ea23b82b61f51c4b08bc6123918b9639146f4b6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 19:06:49 -0500 
Subject: [PATCH 029/273] add `broadcast_object_list` --- ocpmodels/common/distutils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ocpmodels/common/distutils.py b/ocpmodels/common/distutils.py index 32d30ebea3..7057d9695b 100644 --- a/ocpmodels/common/distutils.py +++ b/ocpmodels/common/distutils.py @@ -95,6 +95,12 @@ def broadcast(tensor, src, group=dist.group.WORLD, async_op=False): dist.broadcast(tensor, src, group, async_op) +def broadcast_object_list(obj_list, src=0): + if get_world_size() == 1: + return + dist.broadcast_object_list(obj_list, src=src) + + def all_reduce(data, group=dist.group.WORLD, average=False, device=None): if get_world_size() == 1: return data From 08bd4cc7cc7bb5034dd9a0109e945faacb96bf53 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 19:07:03 -0500 Subject: [PATCH 030/273] add `orion` flags --- ocpmodels/common/flags.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index f487115b31..354b52ee75 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -240,6 +240,20 @@ def add_core_args(self): default=100, help="Log training loss every n steps", ) + self.parser.add_argument( + "--orion_search", + "-o", + type=str, + help="Path to an orion search space yaml file", + ) + self.parser.add_argument( + "--unique_exp_name", + "-u", + type=str, + help="Name for this experiment. If the experiment name already exists," + + " the search space MUST be the same. If it is not, the job will crash." + + " If you change the search space, you must change the experiment name.", + ) flags = Flags() From 6a388ea6eb671c0d5aa3bcb7467e3fd34d193801 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 7 Jan 2023 19:07:11 -0500 Subject: [PATCH 031/273] declare `objective` --- ocpmodels/trainers/base_trainer.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 4804526f11..8b5fa7e077 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -783,14 +783,10 @@ def eval_all_splits( overall_energy_mae = cumulated_energy_mae / len(all_splits) self.logger.log({"Eval time": cumulated_time}) self.logger.log({"Overall MAE": overall_energy_mae}) + self.objective = overall_energy_mae if self.config["model"].get("regress_forces", False): overall_forces_mae = cumulated_forces_mae / len(all_splits) self.logger.log({"Overall Forces MAE": overall_forces_mae}) - if self.logger.ntfy: - self.logger.ntfy( - message=f"{JOB_ID} - Overall MAE: {overall_energy_mae}", - click=self.logger.url, - ) # Run on test split if final and "test" in self.config["dataset"] and self.eval_on_test: @@ -935,3 +931,11 @@ def handle_sigterm(self, signum, _): if signum == 15 and not self.sigterm: print("\nHandling SIGTERM signal received.\n") self.sigterm = True + + def close_datasets(self): + try: + for ds in self.datasets.values(): + if hasattr(ds, "close_db") and callable(ds.close_db): + ds.close_db() + except Exception as e: + print("Error closing datasets: ", str(e)) From 527a39e72f05b1bd1ad6f98b916307071288250f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 8 Jan 2023 01:14:01 -0500 Subject: [PATCH 032/273] fix `broadcast_object_list`device --- ocpmodels/common/distutils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/distutils.py b/ocpmodels/common/distutils.py index 7057d9695b..d4f4c13894 100644 --- 
a/ocpmodels/common/distutils.py +++ b/ocpmodels/common/distutils.py @@ -98,7 +98,12 @@ def broadcast(tensor, src, group=dist.group.WORLD, async_op=False): def broadcast_object_list(obj_list, src=0): if get_world_size() == 1: return - dist.broadcast_object_list(obj_list, src=src) + dist.broadcast_object_list( + obj_list, + src=src, + group=dist.group.WORLD, + device=torch.device(f"cuda:{get_rank()}"), + ) def all_reduce(data, group=dist.group.WORLD, average=False, device=None): From 6303248583ee9725f35ce2aa0898a3b319789476 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 8 Jan 2023 01:14:38 -0500 Subject: [PATCH 033/273] refactor to `Runner` and v0 for Orion --- main.py | 111 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 41 deletions(-) diff --git a/main.py b/main.py index 1d6a2b842d..29e0e1221e 100644 --- a/main.py +++ b/main.py @@ -16,6 +16,8 @@ from pathlib import Path import torch +from orion.client import build_experiment +from yaml import safe_load from ocpmodels.common import distutils from ocpmodels.common.flags import flags @@ -23,6 +25,7 @@ from ocpmodels.common.utils import ( JOB_ID, build_config, + merge_dicts, resolve, setup_imports, setup_logging, @@ -133,8 +136,53 @@ def print_warnings(): print("-" * 80 + "\n") +class Runner: + def __init__(self, trainer_config): + self.trainer_config = trainer_config + self.trainer = None + + def run(self, **hparams): + self.original_config = copy.deepcopy(self.trainer_config) + self.hparams = hparams + + should_be_0 = distutils.get_rank() + hp_list = [hparams, should_be_0] + distutils.broadcast_object_list(hp_list) + hparams, should_be_0 = hp_list + print("hparams: ", hparams) + print("should_be_0: ", should_be_0) + assert should_be_0 == 0 + if hparams: + print("Received hyper-parameters from Orion:") + print(hparams) + + self.trainer_config = merge_dicts(self.trainer_config, hparams) + cls = registry.get_trainer_class(self.trainer_config["trainer"]) + self.trainer: BaseTrainer = cls(**self.trainer_config) + task = registry.get_task_class(self.trainer_config["mode"])(self.trainer_config) + task.setup(self.trainer) + start_time = time.time() + print_warnings() + + signal = task.run() + + # handle job preemption / time limit + if signal == "SIGTERM": + print("\nJob was preempted. 
Wrapping up...\n") + self.trainer.close_datasets() + + distutils.synchronize() + logging.info(f"Total time taken: {time.time() - start_time}") + if self.trainer.logger is not None: + self.trainer.logger.log({"Total time": time.time() - start_time}) + + return [ + {"name": "energy_mae", "type": "objective", "value": self.trainer.objective} + ] + + if __name__ == "__main__": - ntfy = trainer = error = signal = None + runner = error = signal = None setup_logging() @@ -166,48 +214,29 @@ def print_warnings(): setup_imports() trainer_config = should_continue(trainer_config) trainer_config = read_slurm_env(trainer_config) + runner = Runner(trainer_config) # ------------------- # ----- Train ----- # ------------------- - trainer: BaseTrainer = registry.get_trainer_class(trainer_config["trainer"])( - **trainer_config - ) - task = registry.get_task_class(trainer_config["mode"])(trainer_config) - task.setup(trainer) - start_time = time.time() - if trainer.logger is not None: - message = f"{JOB_ID} - Training started 🚀" - if trainer_config.get("note"): - message += f" - {trainer_config.get('note')}" - if trainer_config.get("wandb_tags"): - message += f" - {trainer_config.get('wandb_tags')}" - trainer.logger.ntfy(message, click=trainer.logger.url) - print_warnings() - - signal = task.run() - - # handle job preemption / time limit - if signal == "SIGTERM": - print("\nJob was preempted. Wrapping up...\n") - for ds in trainer.datasets.values(): - if hasattr(ds, "close_db") and callable(ds.close_db): - ds.close_db() - - # ----------------- - # ----- End ----- - # ----------------- - distutils.synchronize() - logging.info(f"Total time taken: {time.time() - start_time}") - if trainer.logger is not None: - trainer.logger.log({"Total time": time.time() - start_time}) - - except Exception as e: - if trainer and trainer.logger: - e_name = e.__class__.__name__ - trainer.logger.ntfy( - f"{JOB_ID} - Training failed 😭" + f"{e_name} - {str(e)}", - click=trainer.logger.url or None, + if args.orion_search and distutils.is_master(): + assert args.unique_exp_name + space = safe_load(Path(args.orion_search).read_text()) + print("Search Space: ", space) + experiment = build_experiment( + name=args.unique_exp_name, + space=space, + algorithms={"mofa": {"seed": 123}}, + ) + experiment.workon( + runner.run, + max_trials_per_worker=1, + n_workers=1, + idle_timeout=3600 * 24 * 4, ) + else: + runner.run() + + except Exception: error = True print(traceback.format_exc()) @@ -220,8 +249,8 @@ def print_warnings(): distutils.cleanup() print("Done!") - if trainer and trainer.logger: - trainer.logger.finish(error or signal) + if runner and runner.trainer and runner.trainer.logger: + runner.trainer.logger.finish(error or signal) if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): print("Self-canceling SLURM job", JOB_ID) From a249efc119ba462b579861b01499595223ac44db Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:14:29 -0500 Subject: [PATCH 034/273] debug print `i_for_epoch` --- ocpmodels/trainers/single_trainer.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 7dd1b0aec4..bb14e1636d 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -5,6 +5,7 @@ LICENSE file in the root directory of this source tree. 
""" +import datetime import logging import os import time @@ -41,6 +42,10 @@ class SingleTrainer(BaseTrainer): can be found in `configs/ocp_is2re `_. # noqa: E501 """ + @property + def now(self): + return str(datetime.datetime.now()).split(".")[0] + def load_task(self): if not self.silent: logging.info(f"Loading dataset: {self.config['task']['dataset']}") @@ -200,7 +205,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. start_epoch = self.step // n_train - loader_times = Times() + timer = Times() epoch_times = [] model_run_time = 0 @@ -224,11 +229,12 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): if self.sigterm: return "SIGTERM" i_for_epoch += 1 + print(self.now, "i_for_epoch: ", i_for_epoch, flush=True) self.epoch = epoch_int + (i + 1) / n_train self.step = epoch_int * n_train + i + 1 - # Get a batch - with loader_times.next("get_batch"): + # Get a batch. + with timer.next("get_batch"): batch = next(train_loader_iter) # Forward, loss, backward. @@ -272,10 +278,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): ) # Log metrics. - gbm, gbs = loader_times.prepare_for_logging() + gbm, gbs = timer.prepare_for_logging() self.metrics["get_batch_time_mean"] = {"metric": gbm["get_batch"]} self.metrics["get_batch_time_std"] = {"metric": gbs["get_batch"]} - loader_times.reset() + timer.reset() # logging.info(f"Step: {self.step}") self.log_train_metrics() From cde8e1ae18a0cde999f23d0866fb5372702639ff Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:15:09 -0500 Subject: [PATCH 035/273] add debug prints --- main.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/main.py b/main.py index 29e0e1221e..90282124ac 100644 --- a/main.py +++ b/main.py @@ -30,6 +30,7 @@ setup_imports, setup_logging, update_from_sbatch_py_vars, + move_lmdb_data_to_slurm_tmpdir, ) from ocpmodels.trainers import BaseTrainer @@ -147,10 +148,10 @@ def run(self, **hparams): should_be_0 = distutils.get_rank() hp_list = [hparams, should_be_0] + # print("hparams pre-broadcast: ", hparams) distutils.broadcast_object_list(hp_list) hparams, should_be_0 = hp_list - print("hparams: ", hparams) - print("should_be_0: ", should_be_0) + # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 if hparams: print("Received hyper-parameters from Orion:") @@ -176,9 +177,14 @@ def run(self, **hparams): if self.trainer.logger is not None: self.trainer.logger.log({"Total time": time.time() - start_time}) - return [ - {"name": "energy_mae", "type": "objective", "value": self.trainer.objective} - ] + objective = self.trainer.objective + # print("objective pre-broadcast: ", objective) + o_list = [objective] + distutils.broadcast_object_list(o_list) + objective = o_list[0] + # print("objective post-broadcast: ", objective) + + return [{"name": "energy_mae", "type": "objective", "value": objective}] if __name__ == "__main__": @@ -206,15 +212,22 @@ def run(self, **hparams): if args.distributed: distutils.setup(trainer_config) + print("Distributed backend setup.") + + if distutils.is_master(): + trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) + # distutils.synchronize() try: # ------------------- # ----- Setup ----- # ------------------- setup_imports() + print("All things imported.") trainer_config = should_continue(trainer_config) trainer_config = read_slurm_env(trainer_config) 
runner = Runner(trainer_config) + print("Runner ready.") # ------------------- # ----- Train ----- # ------------------- @@ -234,6 +247,7 @@ def run(self, **hparams): idle_timeout=3600 * 24 * 4, ) else: + print("Starting runner.") runner.run() except Exception: @@ -243,7 +257,7 @@ def run(self, **hparams): finally: if args.distributed: print( - "Waiting for all processes to finish with distutils.cleanup()...", + "\nWaiting for all processes to finish with distutils.cleanup()...", end="", ) distutils.cleanup() @@ -253,5 +267,5 @@ def run(self, **hparams): runner.trainer.logger.finish(error or signal) if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): - print("Self-canceling SLURM job", JOB_ID) + print("\nSelf-canceling SLURM job", JOB_ID) os.system(f"scancel {JOB_ID}") From 48b83796313f8c4b29bac795bc398c934813776f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:16:23 -0500 Subject: [PATCH 036/273] read from scratch --- configs/models/tasks/is2re.yaml | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/configs/models/tasks/is2re.yaml b/configs/models/tasks/is2re.yaml index fe7ed92187..059ef62c53 100644 --- a/configs/models/tasks/is2re.yaml +++ b/configs/models/tasks/is2re.yaml @@ -18,30 +18,27 @@ default: dataset: default_val: val_id train: - src: /network/projects/_groups/ocp/oc20/is2re/all/train/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/train/ normalize_labels: True target_mean: -1.525913953781128 target_std: 2.279365062713623 val_id: - src: /network/projects/_groups/ocp/oc20/is2re/all/val_id/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_id/ val_ood_cat: - src: /network/projects/_groups/ocp/oc20/is2re/all/val_ood_cat/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_cat/ val_ood_ads: - src: /network/projects/_groups/ocp/oc20/is2re/all/val_ood_ads/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_ads/ val_ood_both: - src: /network/projects/_groups/ocp/oc20/is2re/all/val_ood_both/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/all/val_ood_both/ 10k: dataset: train: - src: /network/projects/_groups/ocp/oc20/is2re/10k/train/data.lmdb # data/is2re/10k/train/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/10k/train 100k: dataset: train: - src: /network/projects/_groups/ocp/oc20/is2re/100k/train/data.lmdb + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/is2re/100k/train -all: - dataset: - train: - src: /network/projects/_groups/ocp/oc20/is2re/all/train/data.lmdb +all: {} \ No newline at end of file From 253a067cace42bf1d0bf3c81d41500d71a6758dd Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 14:16:59 -0500 Subject: [PATCH 037/273] initial None objective --- ocpmodels/trainers/base_trainer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 8b5fa7e077..4a0216c074 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -66,6 +66,7 @@ def __init__(self, **kwargs): } self.sigterm = False + self.objective = None self.epoch = 0 self.step = 0 self.cpu = self.config["cpu"] From f8d70d1ba0605633be8e16a2dd4d0419a47a1938 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 9 Jan 2023 09:32:34 -0500 Subject: [PATCH 038/273] Force MAE + many config files --- configs/exps/gnn/test-gnn-all-1.yaml | 6 +++--- configs/exps/icml/baseline_s2ef.yaml | 
20 ++++++++---------- .../exps/prop-check/symmetries_s2ef_2.yaml | 21 +++++-------------- configs/models/dpp.yaml | 2 +- ocpmodels/trainers/base_trainer.py | 3 ++- ocpmodels/trainers/single_trainer.py | 6 ++++-- 6 files changed, 24 insertions(+), 34 deletions(-) diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml index 00a54ada07..ea5dd8ec56 100644 --- a/configs/exps/gnn/test-gnn-all-1.yaml +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -17,7 +17,7 @@ default: energy_head: 'weighted-av-initial-embeds' # False ? wandb_tags: 'test-fanet' optim: - lr_initial: 0.0005 + lr_initial: 0.0008 runs: - config: fanet-is2re-all @@ -61,7 +61,7 @@ runs: mp_type: updownscale edge_embed_type: all optim: - lr_initial: 0.0004 + lr_initial: 0.0007 max_epochs: 25 - config: fanet-is2re-all @@ -75,7 +75,7 @@ runs: frame_averaging: 2D fa_fames: random optim: - lr_initial: 0.0004 + lr_initial: 0.0005 max_epochs: 25 - config: fanet-is2re-all diff --git a/configs/exps/icml/baseline_s2ef.yaml b/configs/exps/icml/baseline_s2ef.yaml index cefff0fb5e..154a4c07a5 100644 --- a/configs/exps/icml/baseline_s2ef.yaml +++ b/configs/exps/icml/baseline_s2ef.yaml @@ -1,27 +1,25 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:2 + gres: gpu:rtx8000:1 partition: long time: 42:00:00 default: test_ri: True mode: train - wandb_tags: 'baseline-schnet' + wandb_tags: 'Baseline dpp 1 Gpus' runs: + - config: dpp-s2ef-2M + note: 'Baseline Schnet S2EF' + optim: + batch_size: 368 + eval_batch_size: 368 - config: schnet-s2ef-2M note: 'Baseline Schnet S2EF 2 GPU' optim: max_epochs: 15 force_coefficient: 50 - batch_size: 96 - eval_batch_size: 96 - - config: schnet-is2re-2M - note: 'Baseline Schnet IS2RE 2 GPU' - optim: - max_epochs: 15 - force_coefficient: 50 - batch_size: 128 - eval_batch_size: 128 + batch_size: 192 + eval_batch_size: 192 diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml index aebe1b7934..9abfc02b40 100644 --- a/configs/exps/prop-check/symmetries_s2ef_2.yaml +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -1,9 +1,9 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:1 + gres: gpu:rtx8000:4 partition: long - time: 20:00:00 + time: 40:00:00 default: test_ri: True @@ -16,19 +16,17 @@ default: energy_head: False # False ? 
optim: max_epochs: 5 - batch_size: 196 - eval_batch_size: 196 - wandb_tags: 's2ef-sym-prop' + wandb_tags: 'prop-check-ICLM' runs: - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs 1 Gpu' + note: 'Baseline 5 epochs' frame_averaging: 2D fa_frames: all model: regress_forces: from_energy - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs 1 Gpu' + note: 'Baseline 5 epochs' frame_averaging: 3D fa_frames: all model: @@ -77,12 +75,3 @@ runs: energy_grad_coefficient: 100 force_coefficient: 30 energy_coefficient: 0 - - config: sfarinet-s2ef-2M - note: 'Large force coef' - frame_averaging: 2D - fa_frames: random - model: - regress_forces: direct_with_gradient_target - optim: - force_coefficient: 75 - energy_coefficient: 1 \ No newline at end of file diff --git a/configs/models/dpp.yaml b/configs/models/dpp.yaml index 6a289bbc22..4b973595f4 100644 --- a/configs/models/dpp.yaml +++ b/configs/models/dpp.yaml @@ -119,7 +119,7 @@ s2ef: - 41666 warmup_steps: 10416 warmup_factor: 0.2 - max_epochs: 15 + max_epochs: 5 force_coefficient: 50 model: hidden_channels: 192 diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 4a0216c074..72bfea076d 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -783,8 +783,9 @@ def eval_all_splits( if final and self.config["logger"] == "wandb" and distutils.is_master(): overall_energy_mae = cumulated_energy_mae / len(all_splits) self.logger.log({"Eval time": cumulated_time}) - self.logger.log({"Overall MAE": overall_energy_mae}) self.objective = overall_energy_mae + self.logger.log({"Eval time": cumulated_time}) + self.logger.log({"Overall MAE": overall_energy_mae}) if self.config["model"].get("regress_forces", False): overall_forces_mae = cumulated_forces_mae / len(all_splits) self.logger.log({"Overall Forces MAE": overall_forces_mae}) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index bb14e1636d..e7e062ebb3 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -372,7 +372,9 @@ def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times) batch = next(iter(self.loaders[self.config["dataset"]["default_val"]])) self.model_forward(batch) self.logger.log({"Batch time": time.time() - start_time}) - self.logger.log({"Model run time": model_run_time / n_train}) + self.logger.log( + {"Model run time": model_run_time / len(self.loaders["train"])} + ) if log_epoch_times: self.logger.log({"Epoch time": np.mean(epoch_times)}) @@ -697,7 +699,7 @@ def test_model_symmetries(self, debug_batches=-1): # assert torch.allclose( # torch.abs( # batch[0].force @ reflected["rot"].to(batch[0].force.device) - # - reflected["batch_list"][0].force # .to(batch[0].force.device) + # - reflected["batch_list"][0].force #.to(batch[0].force.device) # ).sum(), # torch.tensor([0.0]), # .to(batch[0].force.device) # atol=1e-05, From a8659db81ce7d4221cef8aa870e447aab0cc05a3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 18:15:09 -0500 Subject: [PATCH 039/273] resume from orion --- main.py | 102 +++++------------------------ ocpmodels/common/utils.py | 133 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+), 87 deletions(-) diff --git a/main.py b/main.py index 90282124ac..da04d5cd93 100644 --- a/main.py +++ b/main.py @@ -25,12 +25,15 @@ from ocpmodels.common.utils import ( JOB_ID, build_config, + continue_from_slurm_job_id, + continue_orion_exp, merge_dicts, + 
move_lmdb_data_to_slurm_tmpdir, + read_slurm_env, resolve, setup_imports, setup_logging, update_from_sbatch_py_vars, - move_lmdb_data_to_slurm_tmpdir, ) from ocpmodels.trainers import BaseTrainer @@ -48,81 +51,6 @@ ) -def read_slurm_env(config): - """ - Parses the output of `scontrol show` in order to store the slurm - config (mem, cpu, node, gres) as a `"slurm"` key in the `config` object. - - Args: - config (dict): Run configuration - - Returns: - dict: Updated run config if no "slurm" key exists or it's empty - """ - if not config.get("slurm"): - return config - - command = f"scontrol show job {JOB_ID}" - scontrol = subprocess.check_output(command.split(" ")).decode("utf-8").strip() - params = re.findall(r"TRES=(.+)\n", scontrol) - try: - if params: - params = params[0] - for kv in params.split(","): - k, v = kv.split("=") - config["slurm"][k] = v - except Exception as e: - print("Slurm config creation exception", e) - finally: - return config - - -def should_continue(config): - """ - Assuming runs are consistently executed in a `run_dir` with the - `run_dir/$SLURM_JOBID` pattern, this functions looks for an existing - directory with the same $SLURM_JOBID as the current job that contains - a checkpoint. - - If there is one, it tries to find `best_checkpoint.ckpt`. - If the latter does not exist, it looks for the latest checkpoint, - assuming a naming convention like `checkpoint-{step}.pt`. - - If a checkpoint is found, its path is set in `config["checkpoint"]`. - Otherwise, returns the original config. - - Args: - config (dict): The original config to overwrite - - Returns: - dict: The updated config if a checkpoint has been found - """ - if config.get("checkpoint"): - return config - - job_id = os.environ.get("SLURM_JOBID") - if job_id is None: - return config - - base_dir = Path(config["run_dir"]).resolve().parent - ckpt_dir = base_dir / job_id / "checkpoints" - if not ckpt_dir.exists() or not ckpt_dir.is_dir(): - return config - - best_ckp = ckpt_dir / "best_checkpoint.pt" - if best_ckp.exists(): - config["checkpoint"] = str(best_ckp) - else: - ckpts = list(ckpt_dir.glob("checkpoint-*.pt")) - if not ckpts: - return config - latest_ckpt = sorted(ckpts, key=lambda f: f.stem)[-1] - if latest_ckpt.exists() and latest_ckpt.is_file(): - config["checkpoint"] = str(latest_ckpt) - - return config - - def print_warnings(): warnings = [ "`max_num_neighbors` is set to 40. 
This should be tuned per model.", @@ -144,6 +72,10 @@ def __init__(self, trainer_config): def run(self, **hparams): self.original_config = copy.deepcopy(self.trainer_config) + if distutils.is_master(): + orion_trial = hparams.pop("orion_trial", None) + if orion_trial: + hparams["orion_hash_params"] = orion_trial.hash_params self.hparams = hparams should_be_0 = distutils.get_rank() @@ -158,6 +90,7 @@ def run(self, **hparams): print(hparams) self.trainer_config = merge_dicts(self.trainer_config, hparams) + self.trainer_config = continue_orion_exp(self.trainer_config) cls = registry.get_trainer_class(self.trainer_config["trainer"]) self.trainer: BaseTrainer = cls(**self.trainer_config) task = registry.get_task_class(self.trainer_config["mode"])(self.trainer_config) @@ -224,28 +157,23 @@ def run(self, **hparams): # ------------------- setup_imports() print("All things imported.") - trainer_config = should_continue(trainer_config) + trainer_config = continue_from_slurm_job_id(trainer_config) trainer_config = read_slurm_env(trainer_config) runner = Runner(trainer_config) print("Runner ready.") # ------------------- # ----- Train ----- # ------------------- - if args.orion_search and distutils.is_master(): - assert args.unique_exp_name - space = safe_load(Path(args.orion_search).read_text()) + if args.orion_search_path and distutils.is_master(): + assert args.orion_unique_exp_name + space = safe_load(Path(args.orion_search_path).read_text()) print("Search Space: ", space) experiment = build_experiment( - name=args.unique_exp_name, + name=args.orion_unique_exp_name, space=space, algorithms={"mofa": {"seed": 123}}, ) - experiment.workon( - runner.run, - max_trials_per_worker=1, - n_workers=1, - idle_timeout=3600 * 24 * 4, - ) + experiment.workon(runner.run, max_trials_per_worker=1, n_workers=1) else: print("Starting runner.") runner.run() diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index fe2a524c46..cd4f426032 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -43,6 +43,139 @@ JOB_ID = os.environ.get("SLURM_JOB_ID") +def continue_orion_exp(trainer_config): + if not trainer_config.get("orion_search_path") or not trainer_config.get( + "orion_unique_exp_name" + ): + return trainer_config + + if "orion_hash_params" not in trainer_config: + faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" + print( + "\n\nWARNING: trainer_config has 'orion_search_path' and 'orion_unique_exp_name'", + "but no 'orion_trial'. 
This can lead to inconsistencies.", + f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", + ) + faulty_path.write_text(yaml.dump(trainer_config)) + return trainer_config + + hash_params = trainer_config["orion_hash_params"] + exp_name = trainer_config["orion_unique_exp_name"] + id_file = f"{exp_name}--{hash_params}.unique" + (Path(trainer_config["run_dir"]) / id_file).touch() + base_dir = Path(trainer_config["run_dir"]).parent + existing_id_files = list(base_dir.glob(f"*/{id_file}")) + + if not existing_id_files: + return trainer_config + + latest_dirs = sorted( + [ + f.parent + for f in existing_id_files + if float(f.parent.name) != float(trainer_config["job_id"]) + ], + key=lambda f: float(f.name), + ) + + if not latest_dirs: + return trainer_config + + latest_ckpts = sorted( + [f for f in (latest_dirs[-1] / "checkpoints").glob("checkpoint-*")], + key=lambda f: float(f.stem.split("-")[-1]), + ) + + if not latest_ckpts: + raise ValueError(f"No checkpoint found in {str(latest_dirs[-1])}") + trainer_config["checkpoint"] = str(latest_ckpts[-1]) + print( + f"\nFound {len(latest_ckpts)} existing Orion runs.", + "Resuming from latest:", + str(latest_dirs[-1]), + ) + print("Based on unique file id:", id_file) + print("Continuing from checkpoint:", trainer_config["checkpoint"], end="\n\n") + return trainer_config + + +def read_slurm_env(config): + """ + Parses the output of `scontrol show` in order to store the slurm + config (mem, cpu, node, gres) as a `"slurm"` key in the `config` object. + + Args: + config (dict): Run configuration + + Returns: + dict: Updated run config if no "slurm" key exists or it's empty + """ + if not config.get("slurm"): + return config + + command = f"scontrol show job {JOB_ID}" + scontrol = subprocess.check_output(command.split(" ")).decode("utf-8").strip() + params = re.findall(r"TRES=(.+)\n", scontrol) + try: + if params: + params = params[0] + for kv in params.split(","): + k, v = kv.split("=") + config["slurm"][k] = v + except Exception as e: + print("Slurm config creation exception", e) + finally: + return config + + +def continue_from_slurm_job_id(config): + """ + Assuming runs are consistently executed in a `run_dir` with the + `run_dir/$SLURM_JOBID` pattern, this functions looks for an existing + directory with the same $SLURM_JOBID as the current job that contains + a checkpoint. + + If there is one, it tries to find `best_checkpoint.ckpt`. + If the latter does not exist, it looks for the latest checkpoint, + assuming a naming convention like `checkpoint-{step}.pt`. + + If a checkpoint is found, its path is set in `config["checkpoint"]`. + Otherwise, returns the original config. 
+ + Args: + config (dict): The original config to overwrite + + Returns: + dict: The updated config if a checkpoint has been found + """ + if config.get("checkpoint"): + return config + + job_id = os.environ.get("SLURM_JOBID") + if job_id is None: + return config + + base_dir = Path(config["run_dir"]).resolve().parent + ckpt_dir = base_dir / job_id / "checkpoints" + if not ckpt_dir.exists() or not ckpt_dir.is_dir(): + return config + + best_ckp = ckpt_dir / "best_checkpoint.pt" + if best_ckp.exists(): + config["checkpoint"] = str(best_ckp) + else: + ckpts = list(ckpt_dir.glob("checkpoint-*.pt")) + if not ckpts: + return config + latest_ckpt = sorted( + ckpts, key=lambda f: float(f.stem.split("checkpoint-")[-1]) + )[-1] + if latest_ckpt.exists() and latest_ckpt.is_file(): + config["checkpoint"] = str(latest_ckpt) + + return config + + def move_lmdb_data_to_slurm_tmpdir(trainer_config): if ( not trainer_config.get("cp_data_to_tmpdir") From 99ecab351e54030e8d19c0237d5feb3c0ab5658c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 9 Jan 2023 18:15:14 -0500 Subject: [PATCH 040/273] v0 orion exp launch --- launch_exp.py | 47 ++++++++++++++++++++++++++++++++++----- ocpmodels/common/flags.py | 2 +- 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 9196bfb883..7c0e9e7303 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -5,11 +5,13 @@ from pathlib import Path from minydra import resolved_args -from yaml import safe_load +from yaml import safe_load, dump from sbatch import now import copy +ROOT = Path(__file__).resolve().parent + def util_strings(jobs, yaml_comments=False): s = "All jobs launched: " + ", ".join(jobs) @@ -103,7 +105,7 @@ def get_commit(): def find_exp(name): - exp_dir = Path(__file__).parent / "configs" / "exps" + exp_dir = ROOT / "configs" / "exps" exp_file = exp_dir / f"{name}.yaml" if exp_file.exists(): return exp_file @@ -139,16 +141,51 @@ def cli_arg(args, key=""): if __name__ == "__main__": + orion_conf = ROOT / "data" / "orion" / "orion_config.yaml" args = resolved_args() assert "exp" in args regex = args.get("match", ".*") + ts = now() exp_name = args.exp.replace(".yml", "").replace(".yaml", "") exp_file = find_exp(exp_name) exp = safe_load(exp_file.open("r")) - runs = exp["runs"] + if "orion" in exp: + assert "runs" not in exp, "Cannot use both Orion and runs" + assert ( + "orion_unique_exp_name" in exp + ), "Must specify 'orion_unique_exp_name' in exp file" + if not orion_conf.exists(): + orion_conf.write_text( + dump( + { + "storage": { + "database": { + "host": str(orion_conf.parent / "orion_db.pkl"), + "type": "pickleddb", + } + } + } + ) + ) + search_path = ( + orion_conf.parent + / "search-spaces" + / f"{ts}-{exp['orion_unique_exp_name']}.yaml" + ) + search_path.parent.mkdir(exist_ok=True, parents=True) + assert not search_path.exists() + search_path.write_text(dump(exp["orion"])) + runs = [ + { + "orion_search_path": str(search_path), + "orion_unique_exp_name": exp["orion_unique_exp_name"], + } + ] + else: + runs = exp["runs"] commands = [] @@ -191,8 +228,8 @@ def cli_arg(args, key=""): print(f"Launching job {c:3}", end="\r") or os.popen(command).read().strip() for c, command in enumerate(commands) ] - outdir = Path(__file__).resolve().parent / "data" / "exp_outputs" / exp_name - outfile = outdir / f"{exp_name.split('/')[-1]}_{now()}.txt" + outdir = ROOT / "data" / "exp_outputs" / exp_name + outfile = outdir / f"{exp_name.split('/')[-1]}_{ts}.txt" outfile.parent.mkdir(exist_ok=True, parents=True) text += 
separator.join(outputs) jobs = [ diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 354b52ee75..9828598ca4 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -241,7 +241,7 @@ def add_core_args(self): help="Log training loss every n steps", ) self.parser.add_argument( - "--orion_search", + "--orion_search_path", "-o", type=str, help="Path to an orion search space yaml file", From 0a0d8a911c3b2a61693ac6c7afbe2903b5936050 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Mon, 9 Jan 2023 18:22:11 -0500 Subject: [PATCH 041/273] update config files --- configs/exps/gnn/test-gnn-all-1.yaml | 6 ++--- configs/exps/icml/baseline_s2ef.yaml | 20 +++++++++------- configs/exps/prop-check/symmetries.yaml | 24 +++++++++---------- .../exps/prop-check/symmetries_s2ef_2.yaml | 21 ++++++++++++---- configs/models/dpp.yaml | 2 +- configs/models/fanet.yaml | 4 ++-- ocpmodels/trainers/single_trainer.py | 2 +- 7 files changed, 46 insertions(+), 33 deletions(-) diff --git a/configs/exps/gnn/test-gnn-all-1.yaml b/configs/exps/gnn/test-gnn-all-1.yaml index ea5dd8ec56..00a54ada07 100644 --- a/configs/exps/gnn/test-gnn-all-1.yaml +++ b/configs/exps/gnn/test-gnn-all-1.yaml @@ -17,7 +17,7 @@ default: energy_head: 'weighted-av-initial-embeds' # False ? wandb_tags: 'test-fanet' optim: - lr_initial: 0.0008 + lr_initial: 0.0005 runs: - config: fanet-is2re-all @@ -61,7 +61,7 @@ runs: mp_type: updownscale edge_embed_type: all optim: - lr_initial: 0.0007 + lr_initial: 0.0004 max_epochs: 25 - config: fanet-is2re-all @@ -75,7 +75,7 @@ runs: frame_averaging: 2D fa_fames: random optim: - lr_initial: 0.0005 + lr_initial: 0.0004 max_epochs: 25 - config: fanet-is2re-all diff --git a/configs/exps/icml/baseline_s2ef.yaml b/configs/exps/icml/baseline_s2ef.yaml index 154a4c07a5..cefff0fb5e 100644 --- a/configs/exps/icml/baseline_s2ef.yaml +++ b/configs/exps/icml/baseline_s2ef.yaml @@ -1,25 +1,27 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:1 + gres: gpu:rtx8000:2 partition: long time: 42:00:00 default: test_ri: True mode: train - wandb_tags: 'Baseline dpp 1 Gpus' + wandb_tags: 'baseline-schnet' runs: - - config: dpp-s2ef-2M - note: 'Baseline Schnet S2EF' - optim: - batch_size: 368 - eval_batch_size: 368 - config: schnet-s2ef-2M note: 'Baseline Schnet S2EF 2 GPU' optim: max_epochs: 15 force_coefficient: 50 - batch_size: 192 - eval_batch_size: 192 + batch_size: 96 + eval_batch_size: 96 + - config: schnet-is2re-2M + note: 'Baseline Schnet IS2RE 2 GPU' + optim: + max_epochs: 15 + force_coefficient: 50 + batch_size: 128 + eval_batch_size: 128 diff --git a/configs/exps/prop-check/symmetries.yaml b/configs/exps/prop-check/symmetries.yaml index 0b26ce2d81..1da0808595 100644 --- a/configs/exps/prop-check/symmetries.yaml +++ b/configs/exps/prop-check/symmetries.yaml @@ -21,69 +21,69 @@ default: runs: - config: sfarinet-s2ef-2M note: 'Baseline 5 epochs' - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: all - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 3D fa_frames: all - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: DA - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: det - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' 
frame_averaging: 2D fa_frames: se3-det - model: + model: regress_forces: from_energy - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: all - model: + model: regress_forces: direct - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: se3-random - model: + model: regress_forces: direct - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: 2D fa_frames: all - model: + model: regress_forces: direct_with_gradient_target - config: sfarinet-s2ef-2M note: 'Test Force Equivariance' frame_averaging: DA - model: + model: regress_forces: direct_with_gradient_target - config: sfarinet-s2ef-2M note: 'No forces coefficient ! Only energy' - model: + model: regress_forces: direct force_coefficient: 0 energy_grad_coefficient: 10 - config: sfarinet-s2ef-2M note: 'Large energy grad coef' frame_averaging: DA - model: + model: regress_forces: direct_with_gradient_target energy_grad_coefficient: 50 diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml index 9abfc02b40..aebe1b7934 100644 --- a/configs/exps/prop-check/symmetries_s2ef_2.yaml +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -1,9 +1,9 @@ job: mem: 48GB cpus: 4 - gres: gpu:rtx8000:4 + gres: gpu:rtx8000:1 partition: long - time: 40:00:00 + time: 20:00:00 default: test_ri: True @@ -16,17 +16,19 @@ default: energy_head: False # False ? optim: max_epochs: 5 - wandb_tags: 'prop-check-ICLM' + batch_size: 196 + eval_batch_size: 196 + wandb_tags: 's2ef-sym-prop' runs: - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs' + note: 'Baseline 5 epochs 1 Gpu' frame_averaging: 2D fa_frames: all model: regress_forces: from_energy - config: sfarinet-s2ef-2M - note: 'Baseline 5 epochs' + note: 'Baseline 5 epochs 1 Gpu' frame_averaging: 3D fa_frames: all model: @@ -75,3 +77,12 @@ runs: energy_grad_coefficient: 100 force_coefficient: 30 energy_coefficient: 0 + - config: sfarinet-s2ef-2M + note: 'Large force coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + force_coefficient: 75 + energy_coefficient: 1 \ No newline at end of file diff --git a/configs/models/dpp.yaml b/configs/models/dpp.yaml index 4b973595f4..6a289bbc22 100644 --- a/configs/models/dpp.yaml +++ b/configs/models/dpp.yaml @@ -119,7 +119,7 @@ s2ef: - 41666 warmup_steps: 10416 warmup_factor: 0.2 - max_epochs: 5 + max_epochs: 15 force_coefficient: 50 model: hidden_channels: 192 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 5bf8ad3546..b04d7dfba7 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -85,7 +85,7 @@ is2re: # ----- S2EF ----- # ------------------ -# For 4 GPUs +# For 4 GPUs s2ef: default: @@ -115,7 +115,7 @@ s2ef: # 2 gpus 2M: - model: + model: num_interactions: 5 hidden_channels: 1024 num_gaussians: 200 diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index e7e062ebb3..6f9234d45a 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -684,7 +684,7 @@ def test_model_symmetries(self, debug_batches=-1): # Compute total difference across frames for pos1, pos2 in zip(batch[0].fa_pos, rotated["batch_list"][0].fa_pos): pos_diff += pos1 - pos2 - # Manhanttan distance of pos matrix wrt 0 matrix. + # Manhattan distance of pos matrix wrt 0 matrix. 
pos_diff_total += torch.abs(pos_diff).sum() # Reflect graph and compute diff in prediction From 30eb6b7a17c49a10fb50dbecaf852f65e498c7f7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 01:05:01 -0500 Subject: [PATCH 042/273] improve Orion setup --- configs/exps/debug/orion.yaml | 55 +++++++++++++++++++++++++++++++++++ launch_exp.py | 43 +++++++++++---------------- main.py | 11 ++++++- 3 files changed, 82 insertions(+), 27 deletions(-) create mode 100644 configs/exps/debug/orion.yaml diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml new file mode 100644 index 0000000000..e913e42cef --- /dev/null +++ b/configs/exps/debug/orion.yaml @@ -0,0 +1,55 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 48GB + cpus: 4 + gres: gpu:16gb:1 + time: 1:00:00 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + wandb_project: ocp-qm + config: schnet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion-debug + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + optim: + batch_size: 64 + warmup_steps: 3000 + lr_initial: 0.0002 + # parameters EMA + ema_decay: 0.999 + # exp. decay to 0.01 * lr_initial in 1000000 steps + decay_steps: max_steps + decay_rate: 0.05 # at the end of training, lr is decay_rate*lr_initial + # max_epochs = ref_steps[3e6] / (n_train[110 000] / ref_batch_size[32]) + max_epochs: -1 + max_steps: 3000000 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + +orion: + # Remember to change the experiment name if you change anything in the search space + _meta_: + n_runs: 2 + unique_exp_name: ocp-qm9-orion-debug + optim: + batch_size: uniform(32, 1024, discrete=True) + lr_initial: loguniform(1e-5, 5e-3, precision=2) + max_steps: fidelity(1e4, 1e6, base=5e5) + model: + num_gaussians: uniform(16, 200, base=20, discrete=True) + hidden_channels: uniform(32, 512, discrete=True) + num_filters: uniform(32, 512, discrete=True) + num_interactions: uniform(1, 7, discrete=True) + phys_embeds: choices([True, False]) \ No newline at end of file diff --git a/launch_exp.py b/launch_exp.py index 7c0e9e7303..34999ce607 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -80,12 +80,14 @@ def write_exp_yaml_and_jobs(exp_file, outfile, jobs): jobs (list[str]): List of jobs, one per run line in the yaml exp_file """ lines = exp_file.read_text().splitlines() - run_line = lines.index("runs:") - j = 0 - for i, line in enumerate(lines[run_line:]): - if line.strip().startswith("- "): - lines[run_line + i] = f"{line} # {jobs[j]}" - j += 1 + if "runs:" in lines: + run_line = lines.index("runs:") + j = 0 + for i, line in enumerate(lines[run_line:]): + if line.strip().startswith("- "): + lines[run_line + i] = f"{line} # {jobs[j]}" + j += 1 + lines += [""] + util_strings(jobs, True).splitlines() yml_out = outfile.with_suffix(".yaml") yml_out.write_text("\n".join(lines)) @@ -141,7 +143,6 @@ def cli_arg(args, key=""): if __name__ == "__main__": - orion_conf = ROOT / "data" / "orion" / "orion_config.yaml" args = resolved_args() assert "exp" in args regex = args.get("match", ".*") @@ -153,27 +154,16 @@ def cli_arg(args, key=""): exp = safe_load(exp_file.open("r")) if "orion" in exp: + orion_base = ROOT / 
"data" / "orion" assert "runs" not in exp, "Cannot use both Orion and runs" + meta = exp["orion"].pop("_meta_", {}) assert ( - "orion_unique_exp_name" in exp - ), "Must specify 'orion_unique_exp_name' in exp file" - if not orion_conf.exists(): - orion_conf.write_text( - dump( - { - "storage": { - "database": { - "host": str(orion_conf.parent / "orion_db.pkl"), - "type": "pickleddb", - } - } - } - ) - ) + "unique_exp_name" in meta + ), "Must specify 'orion._meta_.unique_exp_name' in exp file" + assert "n_runs" in meta, "Must specify 'orion._meta_.n_runs' in exp file" + search_path = ( - orion_conf.parent - / "search-spaces" - / f"{ts}-{exp['orion_unique_exp_name']}.yaml" + orion_base / "search-spaces" / f"{ts}-{meta['unique_exp_name']}.yaml" ) search_path.parent.mkdir(exist_ok=True, parents=True) assert not search_path.exists() @@ -181,8 +171,9 @@ def cli_arg(args, key=""): runs = [ { "orion_search_path": str(search_path), - "orion_unique_exp_name": exp["orion_unique_exp_name"], + "orion_unique_exp_name": meta["unique_exp_name"], } + for _ in range(meta["n_runs"]) ] else: runs = exp["runs"] diff --git a/main.py b/main.py index da04d5cd93..2fc13171e3 100644 --- a/main.py +++ b/main.py @@ -24,6 +24,7 @@ from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, + ROOT, build_config, continue_from_slurm_job_id, continue_orion_exp, @@ -169,9 +170,17 @@ def run(self, **hparams): space = safe_load(Path(args.orion_search_path).read_text()) print("Search Space: ", space) experiment = build_experiment( + storage={ + "database": { + "host": str( + ROOT / "data" / "orion" / "storage" / "orion_db.pkl" + ), + "type": "pickleddb", + } + }, name=args.orion_unique_exp_name, space=space, - algorithms={"mofa": {"seed": 123}}, + algorithms={"asha": {"seed": 123}}, ) experiment.workon(runner.run, max_trials_per_worker=1, n_workers=1) else: From dc3ddca37a449e3e3842c7a32618b0bd05b43fd1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 01:14:26 -0500 Subject: [PATCH 043/273] update flags --- configs/exps/debug/orion.yaml | 8 ++++---- ocpmodels/common/flags.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml index e913e42cef..1e23cf9299 100644 --- a/configs/exps/debug/orion.yaml +++ b/configs/exps/debug/orion.yaml @@ -1,11 +1,11 @@ # more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij job: - mem: 48GB + mem: 32GB cpus: 4 - gres: gpu:16gb:1 + gres: gpu:1 time: 1:00:00 - partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + partition: main + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 env: ocp-a100 default: diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 9828598ca4..8142c80858 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -247,7 +247,7 @@ def add_core_args(self): help="Path to an orion search space yaml file", ) self.parser.add_argument( - "--unique_exp_name", + "--orion_unique_exp_name", "-u", type=str, help="Name for this experiment. 
If the experiment name already exists," From 4356249038064c5c59626328a88ff41cbf2a3002 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 02:30:31 -0500 Subject: [PATCH 044/273] orion v0.1 --- configs/exps/debug/orion.yaml | 6 ++-- main.py | 49 ++++++++++++++-------------- ocpmodels/common/utils.py | 5 +-- ocpmodels/trainers/single_trainer.py | 2 +- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml index 1e23cf9299..4a02537aa9 100644 --- a/configs/exps/debug/orion.yaml +++ b/configs/exps/debug/orion.yaml @@ -5,7 +5,7 @@ job: gres: gpu:1 time: 1:00:00 partition: main - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab env: ocp-a100 default: @@ -42,13 +42,13 @@ orion: # Remember to change the experiment name if you change anything in the search space _meta_: n_runs: 2 - unique_exp_name: ocp-qm9-orion-debug + unique_exp_name: ocp-qm9-orion-debug-v0.0.2 optim: batch_size: uniform(32, 1024, discrete=True) lr_initial: loguniform(1e-5, 5e-3, precision=2) max_steps: fidelity(1e4, 1e6, base=5e5) model: - num_gaussians: uniform(16, 200, base=20, discrete=True) + num_gaussians: uniform(16, 200, discrete=True) hidden_channels: uniform(32, 512, discrete=True) num_filters: uniform(32, 512, discrete=True) num_interactions: uniform(1, 7, discrete=True) diff --git a/main.py b/main.py index 2fc13171e3..03c4401b38 100644 --- a/main.py +++ b/main.py @@ -8,8 +8,6 @@ import copy import logging import os -import re -import subprocess import time import traceback import warnings @@ -70,27 +68,29 @@ class Runner: def __init__(self, trainer_config): self.trainer_config = trainer_config self.trainer = None + self.hparams = {} - def run(self, **hparams): + def run(self, orion_exp=None): + orion_trial = None self.original_config = copy.deepcopy(self.trainer_config) if distutils.is_master(): - orion_trial = hparams.pop("orion_trial", None) - if orion_trial: - hparams["orion_hash_params"] = orion_trial.hash_params - self.hparams = hparams + if orion_exp: + orion_trial = orion_exp.suggest(1) + self.hparams = orion_trial.params + self.hparams["orion_hash_params"] = orion_trial.hash_params should_be_0 = distutils.get_rank() - hp_list = [hparams, should_be_0] + hp_list = [self.hparams, should_be_0] # print("hparams pre-broadcast: ", hparams) distutils.broadcast_object_list(hp_list) - hparams, should_be_0 = hp_list + self.hparams, should_be_0 = hp_list # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 - if hparams: + if self.hparams: print("Received hyper-parameters from Orion:") - print(hparams) + print(self.hparams) - self.trainer_config = merge_dicts(self.trainer_config, hparams) + self.trainer_config = merge_dicts(self.trainer_config, self.hparams) self.trainer_config = continue_orion_exp(self.trainer_config) cls = registry.get_trainer_class(self.trainer_config["trainer"]) self.trainer: BaseTrainer = cls(**self.trainer_config) @@ -118,7 +118,8 @@ def run(self, **hparams): objective = o_list[0] # print("objective post-broadcast: ", objective) - return [{"name": "energy_mae", "type": "objective", "value": objective}] + if orion_exp is not None: + orion_exp.observe(orion_trial, objective, name="energy_mae") if __name__ == "__main__": @@ -141,7 +142,6 @@ def run(self, **hparams): trainer_config = build_config(args, override_args) trainer_config["optim"]["eval_batch_size"] = trainer_config["optim"]["batch_size"] - setup_logging() 
original_trainer_config = copy.deepcopy(trainer_config) if args.distributed: @@ -152,16 +152,17 @@ def run(self, **hparams): trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) # distutils.synchronize() + # ------------------- + # ----- Setup ----- + # ------------------- + setup_imports() + print("All things imported.") + trainer_config = continue_from_slurm_job_id(trainer_config) + trainer_config = read_slurm_env(trainer_config) + runner = Runner(trainer_config) + print("Runner ready.") + try: - # ------------------- - # ----- Setup ----- - # ------------------- - setup_imports() - print("All things imported.") - trainer_config = continue_from_slurm_job_id(trainer_config) - trainer_config = read_slurm_env(trainer_config) - runner = Runner(trainer_config) - print("Runner ready.") # ------------------- # ----- Train ----- # ------------------- @@ -182,7 +183,7 @@ def run(self, **hparams): space=space, algorithms={"asha": {"seed": 123}}, ) - experiment.workon(runner.run, max_trials_per_worker=1, n_workers=1) + runner.run(orion_exp=experiment) else: print("Starting runner.") runner.run() diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index cd4f426032..35a63ceead 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -52,8 +52,9 @@ def continue_orion_exp(trainer_config): if "orion_hash_params" not in trainer_config: faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" print( - "\n\nWARNING: trainer_config has 'orion_search_path' and 'orion_unique_exp_name'", - "but no 'orion_trial'. This can lead to inconsistencies.", + "\n\nWARNING: trainer_config has 'orion_search_path' and", + "'orion_unique_exp_name' but no 'orion_hash_params'.", + "This can lead to inconsistencies.", f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", ) faulty_path.write_text(yaml.dump(trainer_config)) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 6f9234d45a..e91a6ea9ba 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -229,7 +229,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): if self.sigterm: return "SIGTERM" i_for_epoch += 1 - print(self.now, "i_for_epoch: ", i_for_epoch, flush=True) + # print(self.now, "i_for_epoch: ", i_for_epoch, flush=True) self.epoch = epoch_int + (i + 1) / n_train self.step = epoch_int * n_train + i + 1 From 93d5f139145cf2c2afe5c9aac1f4d5dfcf7c2225 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 02:38:54 -0500 Subject: [PATCH 045/273] missing args to end_of_training() calls --- ocpmodels/trainers/single_trainer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index e91a6ea9ba..ec30642ca4 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -331,7 +331,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") if self.logger: self.logger.add_tags(["E-S"]) - return self.end_of_training() + return self.end_of_training( + epoch_int, debug_batches, model_run_time, epoch_times + ) self.model.train() @@ -350,7 +352,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): # End of training. 
if not is_test_env: - return self.end_of_training() + return self.end_of_training( + epoch_int, debug_batches, model_run_time, epoch_times + ) def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times): From 182e081273a8249a414632419a880d7b16ad2c43 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 02:53:43 -0500 Subject: [PATCH 046/273] validate on test for qm9 --- configs/models/tasks/qm9.yaml | 1 + ocpmodels/common/flags.py | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/models/tasks/qm9.yaml b/configs/models/tasks/qm9.yaml index 42c64032f8..262ec232e7 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -1,6 +1,7 @@ default: trainer: single logger: wandb + eval_on_test: True model: otf_graph: False diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 8142c80858..e1d19fbb5f 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -218,8 +218,7 @@ def add_core_args(self): ) self.parser.add_argument( "--eval_on_test", - action="store_true", - default=False, + type=bool, help="Evaluate on test set", ) self.parser.add_argument( From 512400847a3cf74fc4a15bf1346d12d68b71490a Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 10 Jan 2023 06:38:50 -0500 Subject: [PATCH 047/273] add edge embed option to sfarinet --- configs/exps/gnn/s2ef_1gpu.yaml | 47 +++++++++++++++++++ configs/models/sfarinet.yaml | 7 ++- ocpmodels/models/sfarinet.py | 81 ++++++++++++++++++++++++++++----- 3 files changed, 123 insertions(+), 12 deletions(-) create mode 100644 configs/exps/gnn/s2ef_1gpu.yaml diff --git a/configs/exps/gnn/s2ef_1gpu.yaml b/configs/exps/gnn/s2ef_1gpu.yaml new file mode 100644 index 0000000000..4decd7316f --- /dev/null +++ b/configs/exps/gnn/s2ef_1gpu.yaml @@ -0,0 +1,47 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 30:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + cp_data_to_tmp: true + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? 
+ regress_forces: direct_with_gradient_target + wandb_tags: 's2ef-archi-tests' + optim: + max_epochs: 10 + batch_size: 192 + eval_batch_size: 192 + +runs: + - config: sfarinet-s2ef-2M + note: 'Bigger forces coef' + frame_averaging: 2D + fa_fames: se3-random + optim: + force_coefficient: 75 + - config: sfarinet-s2ef-2M + note: 'Bigger forces coef' + frame_averaging: 2D + fa_fames: se3-random + model: + regress_forces: direct + optim: + force_coefficient: 75 + - config: sfarinet-s2ef-2M + note: 'Bigger forces coef' + frame_averaging: 2D + fa_fames: se3-random + model: + regress_forces: direct + optim: + force_coefficient: 75 diff --git a/configs/models/sfarinet.yaml b/configs/models/sfarinet.yaml index 57bc1afdec..0e2d82d993 100644 --- a/configs/models/sfarinet.yaml +++ b/configs/models/sfarinet.yaml @@ -15,6 +15,7 @@ default: phys_embeds: False # True phys_hidden_channels: 0 energy_head: False # can be {False, weighted-av-initial-embeds, weighted-av-final-embeds, pooling, graclus, random} + edge_embed_type: "" force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True force_decoder_model_config: simple: @@ -89,7 +90,7 @@ is2re: # ----- S2EF ----- # ------------------ -# For 4 GPUs +# For 4 GPUs s2ef: default: @@ -123,6 +124,10 @@ s2ef: all: {} +# ------------------ +# ----- QM9 ----- +# ------------------ + qm9: default: model: diff --git a/ocpmodels/models/sfarinet.py b/ocpmodels/models/sfarinet.py index f55fc414aa..da3ad2a985 100644 --- a/ocpmodels/models/sfarinet.py +++ b/ocpmodels/models/sfarinet.py @@ -2,18 +2,19 @@ """ import torch +from e3nn.o3 import spherical_harmonics from torch import nn from torch.nn import Embedding, Linear from torch_geometric.nn import MessagePassing, radius_graph from torch_scatter import scatter from ocpmodels.common.registry import registry -from ocpmodels.common.utils import get_pbc_distances, conditional_grad +from ocpmodels.common.utils import conditional_grad, get_pbc_distances from ocpmodels.models.base_model import BaseModel +from ocpmodels.models.force_decoder import ForceDecoder from ocpmodels.models.utils.pos_encodings import PositionalEncoding from ocpmodels.modules.phys_embeddings import PhysEmbedding from ocpmodels.modules.pooling import Graclus, Hierarchical_Pooling -from ocpmodels.models.force_decoder import ForceDecoder try: from torch_geometric.nn.acts import swish @@ -47,6 +48,7 @@ def __init__( phys_embeds, graph_rewiring, act, + edge_embed_type, ): super().__init__() self.act = act @@ -58,6 +60,7 @@ def __init__( "one-supernode-per-atom-type", "one-supernode-per-atom-type-dist", } + self.edge_embed_type = edge_embed_type # Phys embeddings self.phys_emb = PhysEmbedding( @@ -97,7 +100,24 @@ def __init__( # MLP self.lin = Linear(hidden_channels, hidden_channels) - self.lin_e = Linear(num_gaussians + 3, hidden_channels) + + # --- Edge embedding --- + if self.edge_embed_type == "": + self.lin_e = Linear(num_gaussians + 3, hidden_channels) + elif self.edge_embed_type == "rij": + self.lin_e = Linear(3, hidden_channels) + elif self.edge_embed_type == "all_rij": + self.lin_e = Linear(3, hidden_channels // 3) # r_ij + self.lin_e2 = Linear(3, hidden_channels // 3) # norm r_ij + self.lin_e3 = Linear( + num_gaussians, hidden_channels - 2 * (hidden_channels // 3) + ) # d_ij + elif self.edge_embed_type == "sh": + self.lin_e = Linear(15, hidden_channels) + elif self.edge_embed_type == "all": + self.lin_e = Linear(18, hidden_channels) + else: + raise ValueError("edge_embedding_type does not exist") self.reset_parameters() 
@@ -114,11 +134,47 @@ def reset_parameters(self): self.lin.bias.data.fill_(0) nn.init.xavier_uniform_(self.lin_e.weight) self.lin_e.bias.data.fill_(0) + if self.edge_embed_type == "all_rij": + nn.init.xavier_uniform_(self.lin_e2.weight) + self.lin_e2.bias.data.fill_(0) + nn.init.xavier_uniform_(self.lin_e3.weight) + self.lin_e3.bias.data.fill_(0) + + def forward( + self, z, rel_pos, edge_attr, tag=None, normalised_rel_pos=None, subnodes=None + ): + + # --- Edge embedding -- + + if self.edge_embed_type == "rij": + e = self.lin_e(rel_pos) + elif self.edge_embed_type == "all_rij": + rel_pos = self.lin_e(rel_pos) # r_ij + normalized_rel_pos = self.lin_e2(normalised_rel_pos) # norm r_ij + edge_attr = self.lin_e3(edge_attr) # d_ij + e = torch.cat((rel_pos, edge_attr, normalized_rel_pos), dim=1) + elif self.edge_embed_type == "sh": + self.sh = spherical_harmonics( + l=[1, 2, 3], + x=normalised_rel_pos, + normalize=False, + normalization="component", + ) + e = self.lin_e(self.sh) + elif self.edge_embed_type == "all": + self.sh = spherical_harmonics( + l=[1, 2, 3], + x=normalised_rel_pos, + normalize=False, + normalization="component", + ) + e = torch.cat((rel_pos, self.sh), dim=1) + e = self.lin_e(e) + else: + e = torch.cat((rel_pos, edge_attr), dim=1) + e = self.lin_e(e) - def forward(self, z, rel_pos, edge_attr, tag=None, subnodes=None): - # Create edge embeddings from d_ij || r_ij - e = torch.cat((rel_pos, edge_attr), dim=1) - # Extension: learn a bond feature vector and concat to above + # --- Atom embedding -- # Create atom embeddings based on its characteristic number h = self.emb(z) @@ -153,7 +209,6 @@ def forward(self, z, rel_pos, edge_attr, tag=None, subnodes=None): # Apply MLP h = self.lin(h) - e = self.lin_e(e) return h, e @@ -269,6 +324,7 @@ class SfariNet(BaseModel): force_decoder_type (str): Type of force decoder to use. 
force_decoder_model_config (dict): Dictionary of config parameters for the decoder's model + edge_embed_type (str): type of edge_embedding """ def __init__(self, **kwargs): @@ -279,6 +335,7 @@ def __init__(self, **kwargs): self.max_num_neighbors = kwargs["max_num_neighbors"] self.regress_forces = kwargs["regress_forces"] self.energy_head = kwargs["energy_head"] + self.edge_embed_type = kwargs["edge_embed_type"] self.distance_expansion = GaussianSmearing( 0.0, self.cutoff, kwargs["num_gaussians"] @@ -297,6 +354,7 @@ def __init__(self, **kwargs): kwargs["phys_embeds"], kwargs["graph_rewiring"], self.act, + kwargs["edge_embed_type"], ) # Interaction block @@ -382,13 +440,14 @@ def energy_forward(self, data): edge_attr = self.distance_expansion(edge_weight) # Normalize and squash to [0,1] for gaussian basis - rel_pos_normalized = rel_pos / edge_weight.view(-1, 1) - rel_pos_normalized = (rel_pos_normalized + 1) / 2.0 + rel_pos_normalized = None + if self.edge_embed_type in {"sh", "all_rij", "all"}: + rel_pos_normalized = (rel_pos / edge_weight.view(-1, 1) + 1) / 2.0 pooling_loss = None # deal with pooling loss # Embedding block - h, e = self.embed_block(z, rel_pos, edge_attr, data.tags) + h, e = self.embed_block(z, rel_pos, edge_attr, data.tags, rel_pos_normalized) # Compute atom weights for late energy head if self.energy_head == "weighted-av-initial-embeds": From aaf07e6d9c4114bfcc747c65be98a8066c6a96d8 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 10 Jan 2023 08:29:53 -0500 Subject: [PATCH 048/273] test edge embed and mp type for is2re --- configs/exps/gnn/edge_embed_type.yaml | 65 ++++++++++++++ configs/exps/gnn/mp_type.yaml | 122 ++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 configs/exps/gnn/edge_embed_type.yaml create mode 100644 configs/exps/gnn/mp_type.yaml diff --git a/configs/exps/gnn/edge_embed_type.yaml b/configs/exps/gnn/edge_embed_type.yaml new file mode 100644 index 0000000000..e64fd7bc3f --- /dev/null +++ b/configs/exps/gnn/edge_embed_type.yaml @@ -0,0 +1,65 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? 
+ wandb_tags: 'edge-embed-test' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + +runs: + - config: sfarinet-is2re-all + note: 'Sfarinet no sym' + - config: sfarinet-is2re-all + note: 'Sfarinet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: sfarinet-is2re-all + note: 'Sfarinet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: sfarinet-is2re-all + note: 'rij' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: rij + - config: sfarinet-is2re-all + note: 'sh' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: sh + - config: sfarinet-is2re-all + note: 'all rij' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all_rij + - config: sfarinet-is2re-all + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + - config: sfarinet-is2re-all + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + optim: + lr_initial: 0.0007 \ No newline at end of file diff --git a/configs/exps/gnn/mp_type.yaml b/configs/exps/gnn/mp_type.yaml new file mode 100644 index 0000000000..47d44133ba --- /dev/null +++ b/configs/exps/gnn/mp_type.yaml @@ -0,0 +1,122 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + +runs: + - config: fanet-is2re-all + note: 'fanet no sym' + - config: fanet-is2re-all + note: 'fanet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: fanet-is2re-all + note: 'fanet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: fanet-is2re-all + note: 'simple' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + - config: fanet-is2re-all + note: 'updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + lr_initial: 0.0005 + - config: fanet-is2re-all + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: + optim: + lr_initial: 0.0001 + mp_type: base + - config: fanet-is2re-all + note: 'smaller lr' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + optim: + lr_initial: 0.0005 + - config: fanet-is2re-all + note: 'small warmup factor and lr' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + lr_initial: 0.0005 + warmup_factor: 0.01 + - config: fanet-is2re-all + note: 'warmup factor + lr' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + lr_initial: 0.0005 + warmup_factor: 0.5 + - config: fanet-is2re-all + note: 'big batch size' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 328 + eval_batch_size: 328 + - config: fanet-is2re-all + note: 'smaller batch' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 180 + eval_batch_size: 180 + - config: fanet-is2re-all + note: 'smaller hidden smaller lr' + 
frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + hidden_channels: 180 + optim: + lr_initial: 0.0005 \ No newline at end of file From b83b142b04696060c084447611370ef0a03e80aa Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 10 Jan 2023 11:03:50 -0500 Subject: [PATCH 049/273] add local env (x2), sfarinet and base_with_att GNN extensions --- ocpmodels/models/fanet.py | 63 +++++++--- ocpmodels/models/utils/attention_model.py | 137 ++++++++++++++++++++++ scripts/gnn_dev.py | 6 +- 3 files changed, 187 insertions(+), 19 deletions(-) create mode 100644 ocpmodels/models/utils/attention_model.py diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 37a58f61f2..2279b41317 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -1,7 +1,5 @@ """ Code of the Scalable Frame Averaging (Rotation Invariant) GNN """ -import math - import torch from e3nn.o3 import spherical_harmonics from torch import nn @@ -13,6 +11,7 @@ from ocpmodels.common.utils import conditional_grad, get_pbc_distances from ocpmodels.models.base_model import BaseModel from ocpmodels.models.force_decoder import ForceDecoder +from ocpmodels.models.utils.attention_model import AttConv from ocpmodels.models.utils.pos_encodings import PositionalEncoding from ocpmodels.modules.phys_embeddings import PhysEmbedding from ocpmodels.modules.pooling import Graclus, Hierarchical_Pooling @@ -237,21 +236,25 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): super(InteractionBlock, self).__init__() self.act = act self.mp_type = mp_type + self.hidden_channels = hidden_channels if self.mp_type == "simple": self.lin_geom = nn.Linear(num_filters, hidden_channels) self.lin_h = nn.Linear(hidden_channels, hidden_channels) + elif self.mp_type == "sfarinet": + self.lin_h = nn.Linear(hidden_channels, hidden_channels) + elif self.mp_type == "updownscale": - self.lin_geom = nn.Linear(num_filters + 2 * hidden_channels, num_filters) - # self.lin_geom = nn.Linear(num_filters, num_filters) # like 'simple' + # self.lin_geom = nn.Linear(num_filters + 2 * hidden_channels, num_filters) + self.lin_geom = nn.Linear(num_filters, num_filters) # like 'simple' self.lin_down = nn.Linear(hidden_channels, num_filters) self.lin_up = nn.Linear(num_filters, hidden_channels) elif self.mp_type == "base_with_att": # --- Compute attention coefficients if required -- - # Change message function - pass + self.lin_h = nn.Linear(hidden_channels, hidden_channels) + self.lin_geom = AttConv(hidden_channels, heads=1, concat=True, bias=True) elif self.mp_type == "att": # --- Compute attention coefficients if required -- @@ -259,7 +262,12 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): pass elif self.mp_type == "local_env": - pass + self.lin_geom = nn.Linear(num_filters, hidden_channels) + self.lin_h = nn.Linear(hidden_channels, hidden_channels) + + elif self.mp_type == "up_down_local_env": + self.lin_h = nn.Linear(hidden_channels, num_filters) + self.lin_geom = nn.Linear(2 * num_filters, hidden_channels) else: # base self.lin_geom = nn.Linear( @@ -268,8 +276,9 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_h = nn.Linear(hidden_channels, hidden_channels) def reset_parameters(self): - nn.init.xavier_uniform_(self.lin_geom.weight) - self.lin_geom.bias.data.fill_(0) + if self.mp_type != "sfarinet": + nn.init.xavier_uniform_(self.lin_geom.weight) + self.lin_geom.bias.data.fill_(0) nn.init.xavier_uniform_(self.lin_h.weight) self.lin_h.bias.data.fill_(0) if self.mp_type == 
"updownscale": @@ -280,32 +289,54 @@ def reset_parameters(self): def forward(self, h, edge_index, e): - if self.mp_type != "simple": + if self.mp_type in {"base"}: e = torch.cat([e, h[edge_index[0]], h[edge_index[1]]], dim=1) # W = self.lin_e_2(self.act(self.lin_e_1(e))) # transform edge rep - W = self.lin_geom(e) + if self.mp_type in {"up_down_local_env", "sfarinet", "base_with_att"}: + W = e + else: + W = self.lin_geom(e) if self.mp_type == "updownscale": h = self.lin_down(h) # downscale node rep. h = self.propagate(edge_index, x=h, W=W) # propagate h = self.lin_up(self.act(h)) # upscale node rep. + elif self.mp_type == "att": # Look at So3krates code pass elif self.mp_type == "base_with_att": - # Combine above and base - pass + h = self.lin_h(self.act(h)) + h = self.lin_geom(h, edge_index, W) + elif self.mp_type == "local_env": - pass + h = self.lin_h(self.act(h)) + chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate + h = self.propagate(edge_index, x=h, W=W) # propagate + h = h + chi + # h = h * chi + elif self.mp_type == "up_down_local_env": + h = self.lin_h(self.act(h)) + chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate + h = self.propagate(edge_index, x=h, W=W) # propagate + h = torch.cat((h, chi), dim=1) + h = self.lin_geom(h) + else: # base, simple h = self.lin_h(self.act(h)) h = self.propagate(edge_index, x=h, W=W) # propagate return h - def message(self, x_j, W): - return x_j * W + def message(self, x_j, W, local_env=None, att=None): + if local_env is not None: + return W + elif att is not None: + # Compute alpha_i + return alpha_i * x_j * W + else: + return x_j * W class OutputBlock(nn.Module): diff --git a/ocpmodels/models/utils/attention_model.py b/ocpmodels/models/utils/attention_model.py new file mode 100644 index 0000000000..4d10561de1 --- /dev/null +++ b/ocpmodels/models/utils/attention_model.py @@ -0,0 +1,137 @@ +from typing import Optional, Tuple, Union + +import torch +from torch import Tensor +from torch.nn import Parameter +from torch_geometric.nn.conv import MessagePassing +from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size +from torch_sparse import SparseTensor + +from ..inits import glorot, zeros + + +class AttConv(MessagePassing): + r"""The graph attentional operator from the `"Graph Attention Networks" + `_ paper + + Args: + hidden_channels (int): Size of each input sample, or :obj:`-1` to + derive the size from the first input(s) to the forward method. + A tuple corresponds to the sizes of source and target + dimensionalities. + heads (int, optional): Number of multi-head-attentions. + (default: :obj:`1`) + concat (bool, optional): If set to :obj:`False`, the multi-head + attentions are averaged instead of concatenated. + (default: :obj:`True`) + bias (bool, optional): If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. + + Shapes: + - **input:** + node features :math:`(|\mathcal{V}|, F_{in})` or + :math:`((|\mathcal{V_s}|, F_{s}), (|\mathcal{V_t}|, F_{t}))` + if bipartite, + edge indices :math:`(2, |\mathcal{E}|)`, + edge features :math:`(|\mathcal{E}|, D)` *(optional)* + - **output:** node features :math:`(|\mathcal{V}|, H * F_{out})` or + :math:`((|\mathcal{V}_t|, H * F_{out})` if bipartite. 
+ If :obj:`return_attention_weights=True`, then + :math:`((|\mathcal{V}|, H * F_{out}), + ((2, |\mathcal{E}|), (|\mathcal{E}|, H)))` + or :math:`((|\mathcal{V_t}|, H * F_{out}), ((2, |\mathcal{E}|), + (|\mathcal{E}|, H)))` if bipartite + """ + + def __init__( + self, + hidden_channels: int, + heads: int = 1, + concat: bool = True, + bias: bool = True, + **kwargs, + ): + kwargs.setdefault("aggr", "add") + super().__init__(node_dim=0, **kwargs) + + self.hidden_channels = hidden_channels + self.heads = heads + self.concat = concat + + # The learnable parameters to compute attention coefficients: + self.att_src = Parameter(torch.Tensor(1, heads, hidden_channels)) + self.att_dst = Parameter(torch.Tensor(1, heads, hidden_channels)) + + if bias and concat: + self.bias = Parameter(torch.Tensor(heads * hidden_channels)) + elif bias and not concat: + self.bias = Parameter(torch.Tensor(hidden_channels)) + else: + self.register_parameter("bias", None) + + self.reset_parameters() + + def reset_parameters(self): + glorot(self.att_src) + glorot(self.att_dst) + zeros(self.bias) + + def forward( + self, + x: Union[Tensor, OptPairTensor], + edge_index: Adj, + edge_attr: OptTensor = None, + size: Size = None, + return_attention_weights=None, + ): + r""" + Args: + return_attention_weights (bool, optional): If set to :obj:`True`, + will additionally return the tuple + :obj:`(edge_index, attention_weights)`, holding the computed + attention weights for each edge. (default: :obj:`None`) + """ + # NOTE: attention weights will be returned whenever + # `return_attention_weights` is set to a value, regardless of its + # actual value (might be `True` or `False`). + + x.view(-1, self.heads, self.hidden_channels) + + # Next, we compute node-level attention coefficients, both for source + # and target nodes (if present): + alpha = (x * self.att_src).sum(dim=-1) + + # edge_updater_type: (alpha: OptPairTensor, edge_attr: OptTensor) + alpha = self.edge_updater(edge_index, alpha=alpha, edge_attr=edge_attr) + + # propagate_type: (x: OptPairTensor, alpha: Tensor) + out = self.propagate( + edge_index, x=x, alpha=alpha, edge_attr=edge_attr, size=size + ) + + if self.concat: + out = out.view(-1, self.heads * self.hidden_channels) + else: + out = out.mean(dim=1) + + if self.bias is not None: + out = out + self.bias + + if isinstance(return_attention_weights, bool): + if isinstance(edge_index, Tensor): + return out, (edge_index, alpha) + elif isinstance(edge_index, SparseTensor): + return out, edge_index.set_value(alpha, layout="coo") + else: + return out + + def message(self, x_j: Tensor, alpha: Tensor, edge_attr: Tensor) -> Tensor: + return alpha.unsqueeze(-1) * x_j * edge_attr + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}({self.in_channels}, " + f"{self.hidden_channels}, heads={self.heads})" + ) diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index d2e114c655..40dc41a18f 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -24,7 +24,7 @@ config["optim"] = {"max_epochs": 0} config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "rij" - config["model"]["mp_type"] = "base" + # config["model"]["mp_type"] = "base" checkpoint_path = None # "checkpoints/2022-04-28-11-42-56-dimenetplusplus/" + "best_checkpoint.pt" @@ -32,8 +32,8 @@ str_args = sys.argv[1:] if all("config" not in arg for arg in str_args): str_args.append("--is_debug") - # str_args.append("--config=fanet-is2re-10k") - str_args.append("--config=sfarinet-s2ef-2M") + str_args.append("--config=sfarinet-is2re-10k") + # 
str_args.append("--config=sfarinet-s2ef-2M") warnings.warn( "No model / mode is given; chosen as default" + f"Using: {str_args[-1]}" ) From 538c9f7262a904d07b3f1c5f442e833fa9e7d938 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 11:14:25 -0500 Subject: [PATCH 050/273] tags a tuple --- ocpmodels/common/logger.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 1e329d59f0..8f47c038ae 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -149,6 +149,7 @@ def mark_preempting(self): def add_tags(self, tags): if not isinstance(tags, list): tags = [tags] + tags = tuple(tags) self.run.tags = self.run.tags + tags def collect_output_files(self, policy="now"): From d2f3e3f7345cbaa5914222dfcaac1253b96a0888 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 12:02:21 -0500 Subject: [PATCH 051/273] fix `observe` signature --- main.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 03c4401b38..b4add00fb6 100644 --- a/main.py +++ b/main.py @@ -119,7 +119,10 @@ def run(self, orion_exp=None): # print("objective post-broadcast: ", objective) if orion_exp is not None: - orion_exp.observe(orion_trial, objective, name="energy_mae") + orion_exp.observe( + orion_trial, + {"type": "objective", "name": "energy_mae", "value": objective}, + ) if __name__ == "__main__": From 02e2d5602de4aeca29b32dcac7640f8189889928 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 10 Jan 2023 13:02:14 -0500 Subject: [PATCH 052/273] two attention mechanisms --- ocpmodels/models/fanet.py | 27 +++++++++++++---------- ocpmodels/models/utils/attention_model.py | 17 +++++++++----- ocpmodels/trainers/base_trainer.py | 7 ++++-- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 2279b41317..6d893b2a26 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -4,7 +4,7 @@ from e3nn.o3 import spherical_harmonics from torch import nn from torch.nn import Embedding, Linear -from torch_geometric.nn import MessagePassing, radius_graph +from torch_geometric.nn import MessagePassing, TransformerConv, radius_graph from torch_scatter import scatter from ocpmodels.common.registry import registry @@ -258,8 +258,15 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): elif self.mp_type == "att": # --- Compute attention coefficients if required -- - # Change message function - pass + self.lin_h = nn.Linear(hidden_channels, hidden_channels) + self.lin_geom = TransformerConv( + hidden_channels, + hidden_channels, + heads=1, + concat=True, + root_weight=False, + edge_dim=num_filters, + ) elif self.mp_type == "local_env": self.lin_geom = nn.Linear(num_filters, hidden_channels) @@ -293,7 +300,7 @@ def forward(self, h, edge_index, e): e = torch.cat([e, h[edge_index[0]], h[edge_index[1]]], dim=1) # W = self.lin_e_2(self.act(self.lin_e_1(e))) # transform edge rep - if self.mp_type in {"up_down_local_env", "sfarinet", "base_with_att"}: + if self.mp_type in {"up_down_local_env", "sfarinet", "base_with_att", "att"}: W = e else: W = self.lin_geom(e) @@ -304,12 +311,11 @@ def forward(self, h, edge_index, e): h = self.lin_up(self.act(h)) # upscale node rep. 
elif self.mp_type == "att": - # Look at So3krates code - pass + h = self.lin_h(self.act(h)) + h = self.lin_geom(h, edge_index, edge_attr=W) elif self.mp_type == "base_with_att": h = self.lin_h(self.act(h)) - h = self.lin_geom(h, edge_index, W) - + h = self.lin_geom(h, edge_index, W) # propagate is inside elif self.mp_type == "local_env": h = self.lin_h(self.act(h)) chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate @@ -329,12 +335,9 @@ def forward(self, h, edge_index, e): return h - def message(self, x_j, W, local_env=None, att=None): + def message(self, x_j, W, local_env=None): if local_env is not None: return W - elif att is not None: - # Compute alpha_i - return alpha_i * x_j * W else: return x_j * W diff --git a/ocpmodels/models/utils/attention_model.py b/ocpmodels/models/utils/attention_model.py index 4d10561de1..b280680f51 100644 --- a/ocpmodels/models/utils/attention_model.py +++ b/ocpmodels/models/utils/attention_model.py @@ -1,14 +1,15 @@ -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch +import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter from torch_geometric.nn.conv import MessagePassing +from torch_geometric.nn.inits import glorot, zeros from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size +from torch_geometric.utils import softmax from torch_sparse import SparseTensor -from ..inits import glorot, zeros - class AttConv(MessagePassing): r"""The graph attentional operator from the `"Graph Attention Networks" @@ -103,12 +104,11 @@ def forward( # and target nodes (if present): alpha = (x * self.att_src).sum(dim=-1) - # edge_updater_type: (alpha: OptPairTensor, edge_attr: OptTensor) - alpha = self.edge_updater(edge_index, alpha=alpha, edge_attr=edge_attr) + alpha = self.edge_updater(edge_index, alpha=(alpha, None)) # propagate_type: (x: OptPairTensor, alpha: Tensor) out = self.propagate( - edge_index, x=x, alpha=alpha, edge_attr=edge_attr, size=size + edge_index, x=x, alpha=alpha, size=size, edge_attr=edge_attr ) if self.concat: @@ -127,6 +127,11 @@ def forward( else: return out + def edge_update(self, alpha_j: Tensor, alpha_i: OptTensor, index: Tensor) -> Tensor: + alpha_j = F.leaky_relu(alpha_j) + alpha_j = softmax(alpha_j, index) + return alpha_j + def message(self, x_j: Tensor, alpha: Tensor, edge_attr: Tensor) -> Tensor: return alpha.unsqueeze(-1) * x_j * edge_attr diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 72bfea076d..7d3f3d0c86 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -44,7 +44,7 @@ ) from ocpmodels.modules.loss import DDPLoss, L2MAELoss from ocpmodels.modules.normalizer import Normalizer -from ocpmodels.modules.scheduler import LRScheduler, EarlyStopper +from ocpmodels.modules.scheduler import EarlyStopper, LRScheduler @registry.register_trainer("base") @@ -75,7 +75,10 @@ def __init__(self, **kwargs): self.test_ri = self.config["test_ri"] self.is_debug = self.config["is_debug"] self.is_hpo = self.config["is_hpo"] - self.eval_on_test = self.config["eval_on_test"] + if self.task_name == "qm9": + self.eval_on_test = self.config["eval_on_test"] + else: + self.eval_on_test = False self.silent = self.config["silent"] self.datasets = {} self.samplers = {} From 449fe5a1940f957021dfeb63c514f91e0be37257 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 14:15:40 -0500 Subject: [PATCH 053/273] orion trial result is a list --- main.py | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b4add00fb6..d4ed02d42d 100644 --- a/main.py +++ b/main.py @@ -121,7 +121,7 @@ def run(self, orion_exp=None): if orion_exp is not None: orion_exp.observe( orion_trial, - {"type": "objective", "name": "energy_mae", "value": objective}, + [{"type": "objective", "name": "energy_mae", "value": objective}], ) From 85ea173f249f1715b1f8fdef91bb459427c6aff0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 17:22:21 -0500 Subject: [PATCH 054/273] resume wandb if resume from Orion --- main.py | 2 +- ocpmodels/common/logger.py | 32 +++++++++++++++++++------------- ocpmodels/common/utils.py | 21 ++++++++++++++------- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/main.py b/main.py index d4ed02d42d..a98b1dcce6 100644 --- a/main.py +++ b/main.py @@ -87,7 +87,7 @@ def run(self, orion_exp=None): # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 if self.hparams: - print("Received hyper-parameters from Orion:") + print("\n💎 Received hyper-parameters from Orion:") print(self.hparams) self.trainer_config = merge_dicts(self.trainer_config, self.hparams) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 8f47c038ae..dab33affcc 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -90,27 +90,33 @@ def mark_preempting(self): class WandBLogger(Logger): def __init__(self, trainer_config): super().__init__(trainer_config) + wandb_tags = note = name = None - wandb_id = str(self.trainer_config.get("wandb_id", "")) - if wandb_id: - wandb_id += " - " - slurm_jobid = os.environ.get("SLURM_JOB_ID") - if slurm_jobid: - wandb_id += f"{slurm_jobid}-" - wandb_id += self.trainer_config["config"] - - wandb_tags = trainer_config.get("wandb_tags", "") - if wandb_tags: - wandb_tags = [t.strip() for t in wandb_tags[:63].split(",")] + if trainer_config.get("wandb_resume_id"): + wandb_id = trainer_config["wandb_resume_id"] + else: + wandb_id = str(self.trainer_config.get("wandb_id", "")) + if wandb_id: + wandb_id += " - " + slurm_jobid = os.environ.get("SLURM_JOB_ID") + if slurm_jobid: + wandb_id += f"{slurm_jobid}-" + wandb_id += self.trainer_config["config"] + + wandb_tags = trainer_config.get("wandb_tags", "") + if wandb_tags: + wandb_tags = [t.strip() for t in wandb_tags[:63].split(",")] + note = self.trainer_config.get("note", "") + name = self.trainer_config["wandb_name"] or wandb_id self.run = wandb.init( config=self.trainer_config, id=wandb_id, - name=self.trainer_config["wandb_name"] or wandb_id, + name=name, dir=self.trainer_config["logs_dir"], project=self.trainer_config["wandb_project"], resume="allow", - notes=self.trainer_config.get("note", ""), + notes=note, tags=wandb_tags, entity="mila-ocp", ) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 35a63ceead..1de5556b1d 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -82,18 +82,25 @@ def continue_orion_exp(trainer_config): if not latest_dirs: return trainer_config - latest_ckpts = sorted( - [f for f in (latest_dirs[-1] / "checkpoints").glob("checkpoint-*")], + resume_dir = latest_dirs[-1] + + resume_ckpts = sorted( + [f for f in (resume_dir / "checkpoints").glob("checkpoint-*")], key=lambda f: float(f.stem.split("-")[-1]), ) - if not latest_ckpts: - raise ValueError(f"No checkpoint found in {str(latest_dirs[-1])}") - trainer_config["checkpoint"] = str(latest_ckpts[-1]) + if not resume_ckpts: + raise ValueError(f"No checkpoint found in {str(resume_dir)}") + 
trainer_config["checkpoint"] = str(resume_ckpts[-1]) + resume_url = (resume_dir / "wandb_url.txt").read_text() + trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] + print( - f"\nFound {len(latest_ckpts)} existing Orion runs.", + f"\n🎁 Found {len(resume_ckpts)} existing Orion runs.", "Resuming from latest:", - str(latest_dirs[-1]), + str(resume_dir), + "\nOn wandb run:", + resume_url, ) print("Based on unique file id:", id_file) print("Continuing from checkpoint:", trainer_config["checkpoint"], end="\n\n") From fc493fc636fc4bfad45ac1e9aa9cdd587ceefb50 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 17:25:43 -0500 Subject: [PATCH 055/273] scheduler selection more robust --- ocpmodels/modules/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 0d993b3925..7c5c01ac1a 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -26,7 +26,7 @@ def __init__(self, optimizer, optim_config): self.optimizer = optimizer self.optim_config = optim_config.copy() self.warmup_scheduler = None - if "scheduler" in self.optim_config: + if self.optim_config.get("scheduler"): self.scheduler_type = self.optim_config["scheduler"] else: self.scheduler_type = "LambdaLR" From 97334932ff240bf961fda099902fa2397bca4d40 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 18:39:35 -0500 Subject: [PATCH 056/273] improve display --- main.py | 6 +++--- ocpmodels/trainers/base_trainer.py | 12 +++++++----- ocpmodels/trainers/single_trainer.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index a98b1dcce6..9570e809c2 100644 --- a/main.py +++ b/main.py @@ -15,7 +15,7 @@ import torch from orion.client import build_experiment -from yaml import safe_load +from yaml import safe_load, dump from ocpmodels.common import distutils from ocpmodels.common.flags import flags @@ -87,8 +87,8 @@ def run(self, orion_exp=None): # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 if self.hparams: - print("\n💎 Received hyper-parameters from Orion:") - print(self.hparams) + print("\n💎💎Received hyper-parameters from Orion:") + print(dump(self.hparams), end="\n💎💎\n") self.trainer_config = merge_dicts(self.trainer_config, self.hparams) self.trainer_config = continue_orion_exp(self.trainer_config) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 7d3f3d0c86..e410d3d885 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -139,7 +139,8 @@ def __init__(self, **kwargs): self.hpo_checkpoint_every = self.config["optim"].get("checkpoint_every", -1) if distutils.is_master() and not self.silent: - print(yaml.dump(self.config, default_flow_style=False)) + print("🧰 Trainer config:") + print(yaml.dump(self.config), end="\n\n") self.load() self.evaluator = Evaluator( @@ -287,7 +288,8 @@ def load_model(self): # Build model if distutils.is_master() and not self.silent: logging.info( - f"Loading model {self.config['model_name']}: {self.config['model']}" + f"Loading model {self.config['model_name']}:" + + f" {yaml.dump(self.config['model'])}" ) bond_feat_dim = None @@ -314,8 +316,8 @@ def load_model(self): f"{self.model.num_params} parameters." 
) - if self.logger is not None: - self.logger.watch(self.model) + # if self.logger is not None: + # self.logger.watch(self.model) self.model = OCPDataParallel( self.model, @@ -543,7 +545,7 @@ def validate( ): if distutils.is_master() and not self.silent: print() - logging.info(f"Evaluating on {split}.") + logging.info(f"🧐 Evaluating on {split}.") if self.is_hpo: disable_tqdm = True diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index ec30642ca4..adf460f72f 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -210,7 +210,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): model_run_time = 0 if not self.silent: - print("---Beginning of Training---") + print(f"--- 🔄 Beginning of Training @ {self.now}---") for epoch_int in range(start_epoch, self.config["optim"]["max_epochs"]): From 70c7efdb7188e5a4f161fdc3a8878090476d2cf9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 18:42:10 -0500 Subject: [PATCH 057/273] it's ok not to find checkpoints --- ocpmodels/common/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 1de5556b1d..95fca5b602 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -90,7 +90,9 @@ def continue_orion_exp(trainer_config): ) if not resume_ckpts: - raise ValueError(f"No checkpoint found in {str(resume_dir)}") + print(f"🥶 Warning: No checkpoint found in {str(resume_dir)}. Not resuming.") + return trainer_config + trainer_config["checkpoint"] = str(resume_ckpts[-1]) resume_url = (resume_dir / "wandb_url.txt").read_text() trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] From 2974e7d0828c9fe7f87e5a4d5630dfd3182bafd9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 10 Jan 2023 18:50:11 -0500 Subject: [PATCH 058/273] handle keyboard interrupt --- launch_exp.py | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 34999ce607..c7109bfdef 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -143,6 +143,7 @@ def cli_arg(args, key=""): if __name__ == "__main__": + is_interrupted = False args = resolved_args() assert "exp" in args regex = args.get("match", ".*") @@ -215,10 +216,13 @@ def cli_arg(args, key=""): confirm = input("\n🚦 Confirm? [y/n]") if confirm == "y": - outputs = [ - print(f"Launching job {c:3}", end="\r") or os.popen(command).read().strip() - for c, command in enumerate(commands) - ] + try: + outputs = [] + for c, command in enumerate(commands): + print(f"Launching job {c:3}", end="\r") + outputs.append(os.popen(command).read().strip()) + except KeyboardInterrupt: + is_interrupted = True outdir = ROOT / "data" / "exp_outputs" / exp_name outfile = outdir / f"{exp_name.split('/')[-1]}_{ts}.txt" outfile.parent.mkdir(exist_ok=True, parents=True) @@ -228,14 +232,19 @@ def cli_arg(args, key=""): for line in text.splitlines() if (sep := "Submitted batch job ") in line ] - text += f"{separator}All jobs launched: {' '.join(jobs)}" - with outfile.open("w") as f: - f.write(text) - print(f"Output written to {str(outfile)}") - print(util_strings(jobs)) - yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) - print( - "Experiment summary YAML in ", f"./{str(yml_out.relative_to(Path.cwd()))}" - ) + + if is_interrupted: + print("\n💀 Interrupted. 
Kill jobs with:\n$ scancel" + " ".join(jobs)) + else: + text += f"{separator}All jobs launched: {' '.join(jobs)}" + with outfile.open("w") as f: + f.write(text) + print(f"Output written to {str(outfile)}") + print(util_strings(jobs)) + yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) + print( + "Experiment summary YAML in ", + f"./{str(yml_out.relative_to(Path.cwd()))}", + ) else: print("Aborting") From 97018bb8f502a6d3b5a769edb2ca406baad2054e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 16:43:40 -0500 Subject: [PATCH 059/273] working orion implementation --- configs/exps/debug/orion.yaml | 37 ++++++++++++++++------------ launch_exp.py | 37 +++++++++++++++++++--------- main.py | 33 +++++++------------------ ocpmodels/common/flags.py | 2 +- ocpmodels/common/utils.py | 34 +++++++++++++++++++++++++ ocpmodels/trainers/base_trainer.py | 6 ++--- ocpmodels/trainers/single_trainer.py | 6 ++++- 7 files changed, 98 insertions(+), 57 deletions(-) diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml index 4a02537aa9..93eb5f5de4 100644 --- a/configs/exps/debug/orion.yaml +++ b/configs/exps/debug/orion.yaml @@ -1,8 +1,8 @@ # more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij job: - mem: 32GB + mem: 24GB cpus: 4 - gres: gpu:1 + gres: gpu:16gb:1 time: 1:00:00 partition: main code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab @@ -32,7 +32,6 @@ default: decay_rate: 0.05 # at the end of training, lr is decay_rate*lr_initial # max_epochs = ref_steps[3e6] / (n_train[110 000] / ref_batch_size[32]) max_epochs: -1 - max_steps: 3000000 note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels optim: batch_size, lr_initial @@ -40,16 +39,22 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - _meta_: - n_runs: 2 - unique_exp_name: ocp-qm9-orion-debug-v0.0.2 - optim: - batch_size: uniform(32, 1024, discrete=True) - lr_initial: loguniform(1e-5, 5e-3, precision=2) - max_steps: fidelity(1e4, 1e6, base=5e5) - model: - num_gaussians: uniform(16, 200, discrete=True) - hidden_channels: uniform(32, 512, discrete=True) - num_filters: uniform(32, 512, discrete=True) - num_interactions: uniform(1, 7, discrete=True) - phys_embeds: choices([True, False]) \ No newline at end of file + n_jobs: 20 + + unique_exp_name: ocp-qm9-orion-debug-v1.0.0 + + space: + optim/max_steps: fidelity(1e5, 1e6, base=3) + optim/batch_size: uniform(32, 128, discrete=True) + optim/lr_initial: loguniform(1e-5, 5e-3, precision=2) + model/num_gaussians: uniform(16, 200, discrete=True) + model/hidden_channels: uniform(32, 512, discrete=True) + model/num_filters: uniform(32, 512, discrete=True) + model/num_interactions: uniform(1, 7, discrete=True) + model/phys_embeds: choices([True, False]) + + algorithms: + asha: + seed: 123 + num_rungs: 5 + num_brackets: 1 diff --git a/launch_exp.py b/launch_exp.py index c7109bfdef..e40199df7a 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -142,6 +142,22 @@ def cli_arg(args, key=""): return s +def get_args_or_exp(key, args, exp): + value = None + if key in args: + if key in exp: + print(f"Overriding orion.{key} from the command-line") + value = args[key] + elif key in exp: + value = exp[key] + else: + raise ValueError( + f"Must specify 'orion.{key}' " + + f"in exp file or from the command-line `{key}=value`" + ) + return value + + if __name__ == "__main__": is_interrupted = False args = resolved_args() @@ 
-157,24 +173,21 @@ def cli_arg(args, key=""): if "orion" in exp: orion_base = ROOT / "data" / "orion" assert "runs" not in exp, "Cannot use both Orion and runs" - meta = exp["orion"].pop("_meta_", {}) - assert ( - "unique_exp_name" in meta - ), "Must specify 'orion._meta_.unique_exp_name' in exp file" - assert "n_runs" in meta, "Must specify 'orion._meta_.n_runs' in exp file" - - search_path = ( - orion_base / "search-spaces" / f"{ts}-{meta['unique_exp_name']}.yaml" - ) + + n_jobs = get_args_or_exp("n_jobs", args, exp["orion"]) + unique_exp_name = get_args_or_exp("unique_exp_name", args, exp["orion"]) + if "unique_exp_name" not in exp: + exp["unique_exp_name"] = unique_exp_name + + search_path = orion_base / "search-spaces" / f"{ts}-{unique_exp_name}.yaml" search_path.parent.mkdir(exist_ok=True, parents=True) assert not search_path.exists() search_path.write_text(dump(exp["orion"])) runs = [ { - "orion_search_path": str(search_path), - "orion_unique_exp_name": meta["unique_exp_name"], + "orion_exp_config_path": str(search_path), } - for _ in range(meta["n_runs"]) + for _ in range(n_jobs) ] else: runs = exp["runs"] diff --git a/main.py b/main.py index 9570e809c2..5d2558beb1 100644 --- a/main.py +++ b/main.py @@ -11,27 +11,26 @@ import time import traceback import warnings -from pathlib import Path import torch -from orion.client import build_experiment -from yaml import safe_load, dump +from yaml import dump from ocpmodels.common import distutils from ocpmodels.common.flags import flags from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, - ROOT, build_config, continue_from_slurm_job_id, continue_orion_exp, + load_orion_exp, merge_dicts, move_lmdb_data_to_slurm_tmpdir, read_slurm_env, resolve, setup_imports, setup_logging, + unflatten_dict, update_from_sbatch_py_vars, ) from ocpmodels.trainers import BaseTrainer @@ -76,7 +75,7 @@ def run(self, orion_exp=None): if distutils.is_master(): if orion_exp: orion_trial = orion_exp.suggest(1) - self.hparams = orion_trial.params + self.hparams = unflatten_dict(orion_trial.params, sep="/") self.hparams["orion_hash_params"] = orion_trial.hash_params should_be_0 = distutils.get_rank() @@ -87,8 +86,8 @@ def run(self, orion_exp=None): # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 if self.hparams: - print("\n💎💎Received hyper-parameters from Orion:") - print(dump(self.hparams), end="\n💎💎\n") + print("\n💎 Received hyper-parameters from Orion:") + print(dump(self.hparams), end="\n") self.trainer_config = merge_dicts(self.trainer_config, self.hparams) self.trainer_config = continue_orion_exp(self.trainer_config) @@ -169,23 +168,9 @@ def run(self, orion_exp=None): # ------------------- # ----- Train ----- # ------------------- - if args.orion_search_path and distutils.is_master(): - assert args.orion_unique_exp_name - space = safe_load(Path(args.orion_search_path).read_text()) - print("Search Space: ", space) - experiment = build_experiment( - storage={ - "database": { - "host": str( - ROOT / "data" / "orion" / "storage" / "orion_db.pkl" - ), - "type": "pickleddb", - } - }, - name=args.orion_unique_exp_name, - space=space, - algorithms={"asha": {"seed": 123}}, - ) + if args.orion_exp_config_path and distutils.is_master(): + experiment = load_orion_exp(args) + print("\nStarting runner.") runner.run(orion_exp=experiment) else: print("Starting runner.") diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index e1d19fbb5f..30024baa19 100644 --- a/ocpmodels/common/flags.py +++ 
b/ocpmodels/common/flags.py @@ -240,7 +240,7 @@ def add_core_args(self): help="Log training loss every n steps", ) self.parser.add_argument( - "--orion_search_path", + "--orion_exp_config_path", "-o", type=str, help="Path to an orion search space yaml file", diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 95fca5b602..0baaeec890 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -30,6 +30,7 @@ import yaml from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas from matplotlib.figure import Figure +from orion.client import build_experiment from torch_geometric.data import Data from torch_geometric.utils import remove_self_loops from torch_scatter import segment_coo, segment_csr @@ -43,6 +44,28 @@ JOB_ID = os.environ.get("SLURM_JOB_ID") +def load_orion_exp(args): + exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) + + assert args.orion_unique_exp_name or exp_config.get( + "unique_exp_name" + ), "Must provide orion_unique_exp_name in the command-line or the config file." + + print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") + experiment = build_experiment( + storage={ + "database": { + "host": str(ROOT / "data" / "orion" / "storage" / "orion_db.pkl"), + "type": "pickleddb", + } + }, + name=args.orion_unique_exp_name or exp_config["unique_exp_name"], + space=exp_config["space"], + algorithms=exp_config["algorithms"], + ) + return experiment + + def continue_orion_exp(trainer_config): if not trainer_config.get("orion_search_path") or not trainer_config.get( "orion_unique_exp_name" @@ -793,6 +816,17 @@ def create_dict_from_args(args: list, sep: str = "."): return return_dict +def unflatten_dict(source, sep="."): + """ + >>> d = {"a.b": 4, "a.c": 5, "r.y": 1} + >>> unflatten_dict(d) + {'a': {'b': 4, 'c': 5}, 'r': {'y': 1}} + """ + target = {} + [dict_set_recursively(target, k.split(sep), v) for k, v in source.items()] + return target + + def load_config_legacy(path: str, previous_includes: list = []): path = Path(path) if path in previous_includes: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e410d3d885..b5a7b71001 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -286,9 +286,9 @@ def load_task(self): def load_model(self): # Build model - if distutils.is_master() and not self.silent: - logging.info( - f"Loading model {self.config['model_name']}:" + if not self.silent: + print( + f"🧠 Loading model {self.config['model_name']}:\n" + f" {yaml.dump(self.config['model'])}" ) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index adf460f72f..d76133fee9 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -48,7 +48,7 @@ def now(self): def load_task(self): if not self.silent: - logging.info(f"Loading dataset: {self.config['task']['dataset']}") + print(f"Loading dataset: {self.config['task']['dataset']}") self.num_targets = 1 # start imports from @@ -80,6 +80,10 @@ def load_task(self): device=self.device, ) else: + print( + "Warning: grad_target_mean not found in normalizer but", + "regress_forces and normalize_labels are true.", + ) self.normalizers["grad_target"] = Normalizer( tensor=self.datasets["train"].data.y[ self.datasets["train"].__indices__ From a0d5aa7312f29d0a0f1efa42caa74745e0542a40 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Wed, 11 Jan 2023 17:51:39 -0500 Subject: [PATCH 060/273] update config and fanet modif --- 
configs/exps/gnn/edge_embed_type.yaml | 29 +--- configs/exps/gnn/edge_embed_type_s2ef.yaml | 65 +++++++++ configs/exps/gnn/mp_type_2.yaml | 133 ++++++++++++++++++ .../exps/prop-check/symmetries_s2ef_2.yaml | 45 +----- .../exps/prop-check/symmetries_s2ef_3.yaml | 92 ++++++++++++ configs/models/fanet.yaml | 19 +-- configs/models/sfarinet.yaml | 17 ++- ocpmodels/models/fanet.py | 34 +++-- scripts/gnn_dev.py | 6 +- 9 files changed, 334 insertions(+), 106 deletions(-) create mode 100644 configs/exps/gnn/edge_embed_type_s2ef.yaml create mode 100644 configs/exps/gnn/mp_type_2.yaml create mode 100644 configs/exps/prop-check/symmetries_s2ef_3.yaml diff --git a/configs/exps/gnn/edge_embed_type.yaml b/configs/exps/gnn/edge_embed_type.yaml index e64fd7bc3f..b7a0418fe2 100644 --- a/configs/exps/gnn/edge_embed_type.yaml +++ b/configs/exps/gnn/edge_embed_type.yaml @@ -16,33 +16,16 @@ default: energy_head: 'weighted-av-initial-embeds' # False ? wandb_tags: 'edge-embed-test' optim: - max_epochs: 5 + max_epochs: 15 batch_size: 256 eval_batch_size: 256 + cp_data_to_tmpdir: true runs: - - config: sfarinet-is2re-all - note: 'Sfarinet no sym' - - config: sfarinet-is2re-all - note: 'Sfarinet baseline sym' - frame_averaging: 2D - fa_frames: se3-random - config: sfarinet-is2re-all note: 'Sfarinet baseline sym' frame_averaging: 2D fa_frames: se3-random - - config: sfarinet-is2re-all - note: 'rij' - frame_averaging: 2D - fa_frames: se3-random - model: - edge_embed_type: rij - - config: sfarinet-is2re-all - note: 'sh' - frame_averaging: 2D - fa_frames: se3-random - model: - edge_embed_type: sh - config: sfarinet-is2re-all note: 'all rij' frame_averaging: 2D @@ -55,11 +38,3 @@ runs: fa_frames: se3-random model: edge_embed_type: all - - config: sfarinet-is2re-all - note: 'all' - frame_averaging: 2D - fa_frames: se3-random - model: - edge_embed_type: all - optim: - lr_initial: 0.0007 \ No newline at end of file diff --git a/configs/exps/gnn/edge_embed_type_s2ef.yaml b/configs/exps/gnn/edge_embed_type_s2ef.yaml new file mode 100644 index 0000000000..5ad120c07d --- /dev/null +++ b/configs/exps/gnn/edge_embed_type_s2ef.yaml @@ -0,0 +1,65 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 30:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + cp_data_to_tmpdir: true + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? 
+ regress_forces: direct_with_gradient_target + wandb_tags: 's2ef-archi-tests' + optim: + max_epochs: 5 + batch_size: 192 + eval_batch_size: 192 + +runs: + - config: sfarinet-s2ef-2M + note: 'Sfarinet no sym' + - config: sfarinet-s2ef-2M + note: 'Sfarinet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: sfarinet-s2ef-2M + note: 'Sfarinet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: sfarinet-s2ef-2M + note: 'rij' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: rij + - config: sfarinet-s2ef-2M + note: 'sh' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: sh + - config: sfarinet-s2ef-2M + note: 'all rij' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all_rij + - config: sfarinet-s2ef-2M + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + - config: sfarinet-s2ef-2M + note: 'all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all diff --git a/configs/exps/gnn/mp_type_2.yaml b/configs/exps/gnn/mp_type_2.yaml new file mode 100644 index 0000000000..c91945a03d --- /dev/null +++ b/configs/exps/gnn/mp_type_2.yaml @@ -0,0 +1,133 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # False ? + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'fanet baseline sym' + frame_averaging: 2D + fa_frames: se3-random + - config: fanet-is2re-all + note: 'simple' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + - config: fanet-is2re-all + note: 'att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: att + - config: fanet-is2re-all + note: 'local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + - config: fanet-is2re-all + note: 'up_down_local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: up_down_local_env + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + - config: fanet-is2re-all + note: 'base_with_att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_with_att + - config: fanet-is2re-all + note: 'updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 128 + eval_batch_size: 128 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 64 + eval_batch_size: 64 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 300 + eval_batch_size: 300 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + warmup_factor: 0.05 + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + 
optim: + warmup_steps: 4000 + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + optim: + lr_gamma: 0.4 + max_epochs: 20 + - config: sfarinet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random diff --git a/configs/exps/prop-check/symmetries_s2ef_2.yaml b/configs/exps/prop-check/symmetries_s2ef_2.yaml index aebe1b7934..9f6ae35be2 100644 --- a/configs/exps/prop-check/symmetries_s2ef_2.yaml +++ b/configs/exps/prop-check/symmetries_s2ef_2.yaml @@ -33,56 +33,15 @@ runs: fa_frames: all model: regress_forces: from_energy - - - config: sfarinet-s2ef-2M + - config: sfarinet-s2ef-2M # 2659788 note: '2D all gradient' frame_averaging: 2D fa_frames: all model: regress_forces: direct_with_gradient_target - - config: sfarinet-s2ef-2M + - config: sfarinet-s2ef-2M # 2659789 note: '2d all no gradient' frame_averaging: 2D fa_frames: all model: regress_forces: direct - - - config: sfarinet-s2ef-2M - note: 'Big energy grad coef' - frame_averaging: 2D - fa_frames: all - model: - regress_forces: direct_with_gradient_target - optim: - energy_grad_coefficient: 100 - force_coefficient: 30 - energy_coefficient: 1 - - config: sfarinet-s2ef-2M - note: 'Big energy grad coef' - frame_averaging: 2D - fa_frames: random - model: - regress_forces: direct_with_gradient_target - optim: - energy_grad_coefficient: 100 - force_coefficient: 30 - energy_coefficient: 1 - - config: sfarinet-s2ef-2M - note: 'No energy coef' - frame_averaging: 2D - fa_frames: random - model: - regress_forces: direct_with_gradient_target - optim: - energy_grad_coefficient: 100 - force_coefficient: 30 - energy_coefficient: 0 - - config: sfarinet-s2ef-2M - note: 'Large force coef' - frame_averaging: 2D - fa_frames: random - model: - regress_forces: direct_with_gradient_target - optim: - force_coefficient: 75 - energy_coefficient: 1 \ No newline at end of file diff --git a/configs/exps/prop-check/symmetries_s2ef_3.yaml b/configs/exps/prop-check/symmetries_s2ef_3.yaml new file mode 100644 index 0000000000..2605009336 --- /dev/null +++ b/configs/exps/prop-check/symmetries_s2ef_3.yaml @@ -0,0 +1,92 @@ +job: + mem: 48GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: False # False ? 
+ optim: + max_epochs: 5 + batch_size: 196 + eval_batch_size: 196 + wandb_tags: 's2ef-sym-prop' + cp_data_to_tmp_dir: True + +runs: + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs 1 Gpu' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: from_energy + - config: sfarinet-s2ef-2M + note: 'Baseline 5 epochs 1 Gpu' + frame_averaging: 3D + fa_frames: all + model: + regress_forces: from_energy + + + + + - config: sfarinet-s2ef-2M + note: '2D all gradient' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + - config: sfarinet-s2ef-2M + note: '2d all no gradient' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct + + - config: sfarinet-s2ef-2M + note: 'Big energy grad coef' + frame_averaging: 2D + fa_frames: all + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 1 + - config: sfarinet-s2ef-2M + note: 'Big energy grad coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 1 + - config: sfarinet-s2ef-2M + note: 'No energy coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + energy_grad_coefficient: 100 + force_coefficient: 30 + energy_coefficient: 0 + - config: sfarinet-s2ef-2M + note: 'Large force coef' + frame_averaging: 2D + fa_frames: random + model: + regress_forces: direct_with_gradient_target + optim: + force_coefficient: 75 + energy_coefficient: 1 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index b04d7dfba7..7426140a2b 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -73,19 +73,21 @@ is2re: hidden_channels: 384 num_interactions: 4 optim: + batch_size: 256 + eval_batch_size: 256 lr_initial: 0.001 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - 18000 - 27000 - 37000 - warmup_steps: 5394 + warmup_steps: 6000 max_epochs: 20 # ------------------ # ----- S2EF ----- # ------------------ -# For 4 GPUs +# For 2 GPUs s2ef: default: @@ -98,14 +100,13 @@ s2ef: force_coefficient: 30 energy_grad_coefficient: 10 optim: - batch_size: 48 - eval_batch_size: 48 - warmup_steps: 25000 + batch_size: 96 + eval_batch_size: 96 warmup_factor: 0.2 lr_gamma: 0.1 lr_initial: 0.0001 max_epochs: 15 - warmup_steps: 20000 + warmup_steps: 30000 lr_milestones: - 55000 - 75000 @@ -113,7 +114,7 @@ s2ef: 200k: {} - # 2 gpus + # 1 gpus 2M: model: num_interactions: 5 @@ -121,8 +122,8 @@ s2ef: num_gaussians: 200 num_filters: 256 optim: - batch_size: 96 - eval_batch_size: 96 + batch_size: 192 + eval_batch_size: 192 20M: {} diff --git a/configs/models/sfarinet.yaml b/configs/models/sfarinet.yaml index 0e2d82d993..f078c948df 100644 --- a/configs/models/sfarinet.yaml +++ b/configs/models/sfarinet.yaml @@ -83,14 +83,14 @@ is2re: - 17981 - 26972 - 35963 - warmup_steps: 5394 + warmup_steps: 6000 max_epochs: 20 # ------------------ # ----- S2EF ----- # ------------------ -# For 4 GPUs +# For 1 GPUs s2ef: default: @@ -103,18 +103,17 @@ s2ef: force_coefficient: 30 energy_grad_coefficient: 10 optim: - batch_size: 48 - eval_batch_size: 48 - warmup_steps: 25000 + batch_size: 192 + eval_batch_size: 192 + warmup_steps: 30000 warmup_factor: 0.2 lr_gamma: 0.1 lr_initial: 0.0002 max_epochs: 20 - warmup_steps: 20000 lr_milestones: - - 50000 - - 70000 - - 90000 + - 55000 + - 80000 + - 105000 200k: {} diff --git 
a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 6d893b2a26..ce059fef70 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -114,15 +114,14 @@ def __init__( if self.edge_embed_type == "rij": self.lin_e1 = Linear(3, num_filters) elif self.edge_embed_type == "all_rij": - self.lin_e1 = Linear(3, num_filters // 3) # r_ij - self.lin_e12 = Linear(3, num_filters // 3) # norm r_ij - self.lin_e13 = Linear( - num_gaussians, num_filters - 2 * (num_filters // 3) + self.lin_e1 = Linear(3, num_filters // 2) # r_ij + self.lin_e12 = Linear( + num_gaussians, num_filters - (num_filters // 2) ) # d_ij elif self.edge_embed_type == "sh": self.lin_e1 = Linear(15, num_filters) elif self.edge_embed_type == "all": - self.lin_e1 = Linear(18, num_filters) + self.lin_e1 = Linear(15, num_filters) else: raise ValueError("edge_embedding_type does not exist") @@ -152,8 +151,6 @@ def reset_parameters(self): if self.edge_embed_type == "all_rij": nn.init.xavier_uniform_(self.lin_e12.weight) self.lin_e12.bias.data.fill_(0) - nn.init.xavier_uniform_(self.lin_e13.weight) - self.lin_e13.bias.data.fill_(0) def forward( self, z, rel_pos, edge_attr, tag=None, normalised_rel_pos=None, subnodes=None @@ -165,9 +162,8 @@ def forward( e = self.lin_e1(rel_pos) elif self.edge_embed_type == "all_rij": rel_pos = self.lin_e1(rel_pos) # r_ij - normalized_rel_pos = self.lin_e12(normalised_rel_pos) # norm r_ij - edge_attr = self.lin_e13(edge_attr) # d_ij - e = torch.cat((rel_pos, edge_attr, normalized_rel_pos), dim=1) + edge_attr = self.lin_e12(edge_attr) # d_ij + e = torch.cat((rel_pos, edge_attr), dim=1) elif self.edge_embed_type == "sh": self.sh = spherical_harmonics( l=[1, 2, 3], @@ -187,8 +183,8 @@ def forward( e = self.lin_e1(e) if self.second_layer_MLP: - e = self.lin_e2(e) - # e = self.lin_e2(self.act(e)) + # e = self.lin_e2(e) + e = self.lin_e2(self.act(e)) # --- Node embedding -- @@ -254,7 +250,14 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): elif self.mp_type == "base_with_att": # --- Compute attention coefficients if required -- self.lin_h = nn.Linear(hidden_channels, hidden_channels) - self.lin_geom = AttConv(hidden_channels, heads=1, concat=True, bias=True) + # self.lin_geom = AttConv(hidden_channels, heads=1, concat=True, bias=True) + self.lin_geom = TransformerConv( + hidden_channels, + hidden_channels, + heads=1, + concat=True, + root_weight=False, + ) elif self.mp_type == "att": # --- Compute attention coefficients if required -- @@ -296,7 +299,7 @@ def reset_parameters(self): def forward(self, h, edge_index, e): - if self.mp_type in {"base"}: + if self.mp_type == "base": e = torch.cat([e, h[edge_index[0]], h[edge_index[1]]], dim=1) # W = self.lin_e_2(self.act(self.lin_e_1(e))) # transform edge rep @@ -316,6 +319,7 @@ def forward(self, h, edge_index, e): elif self.mp_type == "base_with_att": h = self.lin_h(self.act(h)) h = self.lin_geom(h, edge_index, W) # propagate is inside + elif self.mp_type == "local_env": h = self.lin_h(self.act(h)) chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate @@ -329,7 +333,7 @@ def forward(self, h, edge_index, e): h = torch.cat((h, chi), dim=1) h = self.lin_geom(h) - else: # base, simple + else: # base, simple, sfarinet h = self.lin_h(self.act(h)) h = self.propagate(edge_index, x=h, W=W) # propagate diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 40dc41a18f..2797921988 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -21,10 +21,10 @@ config["frame_averaging"] = "2D" config["fa_frames"] = "random" # 
"random" config["test_ri"] = True - config["optim"] = {"max_epochs": 0} + config["optim"] = {"max_epochs": 1} config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "rij" - # config["model"]["mp_type"] = "base" + config["model"]["mp_type"] = "att" checkpoint_path = None # "checkpoints/2022-04-28-11-42-56-dimenetplusplus/" + "best_checkpoint.pt" @@ -32,7 +32,7 @@ str_args = sys.argv[1:] if all("config" not in arg for arg in str_args): str_args.append("--is_debug") - str_args.append("--config=sfarinet-is2re-10k") + str_args.append("--config=fanet-is2re-10k") # str_args.append("--config=sfarinet-s2ef-2M") warnings.warn( "No model / mode is given; chosen as default" + f"Using: {str_args[-1]}" From 35d1f9a4f9c34435bfa3ea08aae8b33083927131 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Wed, 11 Jan 2023 17:59:29 -0500 Subject: [PATCH 061/273] fix sfarinet --- ocpmodels/models/fanet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index ce059fef70..6f262c1563 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -460,6 +460,8 @@ def __init__(self, **kwargs): self.max_num_neighbors = kwargs["max_num_neighbors"] self.edge_embed_type = kwargs["edge_embed_type"] self.skip_co = kwargs["skip_co"] + if kwargs["mp_type"] == 'sfarinet': + kwargs["num_filters"] = kwargs["hidden_channels"] self.act = ( getattr(nn.functional, kwargs["act"]) if kwargs["act"] != "swish" else swish From c0deb8fa94825bf041072d18f2dbbef3c19ae29a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 18:10:26 -0500 Subject: [PATCH 062/273] store `orion_unique_exp_name` --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 5d2558beb1..7c282f303c 100644 --- a/main.py +++ b/main.py @@ -77,6 +77,7 @@ def run(self, orion_exp=None): orion_trial = orion_exp.suggest(1) self.hparams = unflatten_dict(orion_trial.params, sep="/") self.hparams["orion_hash_params"] = orion_trial.hash_params + self.hparams["orion_unique_exp_name"] = orion_exp.name should_be_0 = distutils.get_rank() hp_list = [self.hparams, should_be_0] From 3d868ec65700d4e3a655abb4bf0669a492d60154 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 18:13:31 -0500 Subject: [PATCH 063/273] `auto_note` after orion sampling --- main.py | 2 ++ ocpmodels/common/utils.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index 7c282f303c..73d75f5e15 100644 --- a/main.py +++ b/main.py @@ -20,6 +20,7 @@ from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, + auto_note, build_config, continue_from_slurm_job_id, continue_orion_exp, @@ -92,6 +93,7 @@ def run(self, orion_exp=None): self.trainer_config = merge_dicts(self.trainer_config, self.hparams) self.trainer_config = continue_orion_exp(self.trainer_config) + self.trainer_config = auto_note(self.trainer_config) cls = registry.get_trainer_class(self.trainer_config["trainer"]) self.trainer: BaseTrainer = cls(**self.trainer_config) task = registry.get_task_class(self.trainer_config["mode"])(self.trainer_config) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 0baaeec890..3fd421ea68 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -941,7 +941,6 @@ def build_config(args, args_override): config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) config = override_narval_paths(config) - config = auto_note(config) if not 
config["no_cpus_to_workers"]: cpus = count_cpus() From 49415929137514b47a134e86c27bc75748241c53 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 18:40:56 -0500 Subject: [PATCH 064/273] fix `forces_grad_target` propagation --- ocpmodels/trainers/single_trainer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index d76133fee9..5c26c974fc 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -430,12 +430,13 @@ def model_forward(self, batch_list): .view(-1, 3) ) f_all.append(g_forces) + breakpoint() batch_list[0].pos = original_pos if self.task_name in OCP_TASKS: batch_list[0].cell = original_cell # Average predictions over frames - preds = {"energy": sum(e_all) / len(e_all)} + preds["energy"] = sum(e_all) / len(e_all) if len(p_all) > 0 and all(y is not None for y in p_all): preds["pooling_loss"] = sum(p_all) / len(p_all) if len(f_all) > 0 and all(y is not None for y in f_all): From 84b7b7a3f66cb535a96551ff57f11ec0ce6c8f07 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 18:55:35 -0500 Subject: [PATCH 065/273] trailing breakpoinut --- ocpmodels/trainers/single_trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 5c26c974fc..0c7109bc25 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -430,7 +430,6 @@ def model_forward(self, batch_list): .view(-1, 3) ) f_all.append(g_forces) - breakpoint() batch_list[0].pos = original_pos if self.task_name in OCP_TASKS: batch_list[0].cell = original_cell From 788bf52f3e65c1be1e8026cbda4e661ab8b48886 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 11 Jan 2023 20:05:33 -0500 Subject: [PATCH 066/273] fix `continue_orion_exp` --- ocpmodels/common/utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 3fd421ea68..7945a15121 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -67,16 +67,14 @@ def load_orion_exp(args): def continue_orion_exp(trainer_config): - if not trainer_config.get("orion_search_path") or not trainer_config.get( - "orion_unique_exp_name" - ): + if not trainer_config.get("orion_exp_config_path"): return trainer_config if "orion_hash_params" not in trainer_config: faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" print( - "\n\nWARNING: trainer_config has 'orion_search_path' and", - "'orion_unique_exp_name' but no 'orion_hash_params'.", + "\n\nWARNING: trainer_config has 'orion_exp_config_path'", + "but no 'orion_hash_params'.", "This can lead to inconsistencies.", f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", ) From 0fb4f50288311a8294cb87135884e57732195f90 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 12 Jan 2023 10:26:11 -0500 Subject: [PATCH 067/273] validate GNN MP type extensions --- configs/exps/gnn/mp_type_2.yaml | 96 ++++++------ configs/exps/gnn/test_activ_linear.yaml | 128 +++++++++++++++ ocpmodels/models/fanet.py | 95 ++++++----- ocpmodels/models/utils/attention_model.py | 183 +++++++++++++++++++++- 4 files changed, 414 insertions(+), 88 deletions(-) create mode 100644 configs/exps/gnn/test_activ_linear.yaml diff --git a/configs/exps/gnn/mp_type_2.yaml b/configs/exps/gnn/mp_type_2.yaml index c91945a03d..f19ceceaf1 100644 --- 
a/configs/exps/gnn/mp_type_2.yaml +++ b/configs/exps/gnn/mp_type_2.yaml @@ -24,57 +24,47 @@ default: runs: - config: fanet-is2re-all - note: 'fanet baseline sym' + note: 'simple' frame_averaging: 2D fa_frames: se3-random + model: + second_layer_mlp: true + mp_type: simple - config: fanet-is2re-all note: 'simple' frame_averaging: 2D fa_frames: se3-random model: + skip_co: true mp_type: simple + + - config: fanet-is2re-all note: 'sfarinet' frame_averaging: 2D fa_frames: se3-random model: mp_type: sfarinet + optim: + max_epochs: 20 - config: fanet-is2re-all - note: 'att' - frame_averaging: 2D - fa_frames: se3-random - model: - mp_type: att - - config: fanet-is2re-all - note: 'local_env' - frame_averaging: 2D - fa_frames: se3-random - model: - mp_type: local_env - - config: fanet-is2re-all - note: 'up_down_local_env' + note: 'sfarinet' frame_averaging: 2D fa_frames: se3-random model: - mp_type: up_down_local_env + mp_type: sfarinet + optim: + max_epochs: 20 - config: fanet-is2re-all note: 'base' frame_averaging: 2D fa_frames: se3-random model: mp_type: base - - config: fanet-is2re-all - note: 'base_with_att' - frame_averaging: 2D - fa_frames: se3-random - model: - mp_type: base_with_att - - config: fanet-is2re-all - note: 'updownscale' - frame_averaging: 2D - fa_frames: se3-random - model: - mp_type: updownscale + optim: + warmup_factor: 0.5 + lr_initial: 0.003 + max_epochs: 10 - config: fanet-is2re-all note: 'base' frame_averaging: 2D @@ -82,8 +72,9 @@ runs: model: mp_type: base optim: - batch_size: 128 - eval_batch_size: 128 + warmup_factor: 0.1 + lr_initial: 0.0005 + max_epochs: 10 - config: fanet-is2re-all note: 'base' frame_averaging: 2D @@ -93,41 +84,52 @@ runs: optim: batch_size: 64 eval_batch_size: 64 + max_epochs: 10 - config: fanet-is2re-all - note: 'base' + note: 'att' frame_averaging: 2D fa_frames: se3-random model: - mp_type: base + mp_type: att optim: - batch_size: 300 - eval_batch_size: 300 + lr_initial: 0.0005 + + - config: fanet-is2re-all - note: 'base' + note: 'local_env' frame_averaging: 2D fa_frames: se3-random model: - mp_type: base - optim: - warmup_factor: 0.05 + mp_type: local_env - config: fanet-is2re-all - note: 'sfarinet' + note: 'up_down_local_env' frame_averaging: 2D fa_frames: se3-random model: - mp_type: sfarinet - optim: - warmup_steps: 4000 + mp_type: up_down_local_env - config: fanet-is2re-all - note: 'sfarinet' + note: 'up_down_local_env 2 layer' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: up_down_local_env + second_layer_mlp: true + - config: fanet-is2re-all + note: 'sfarinet 2 layer' frame_averaging: 2D fa_frames: se3-random model: mp_type: sfarinet - optim: - lr_gamma: 0.4 - max_epochs: 20 - - config: sfarinet-is2re-all - note: 'sfarinet' + second_layer_mlp: true + - config: fanet-is2re-all + note: 'base_with_att' frame_averaging: 2D fa_frames: se3-random + model: + mp_type: base_with_att + - config: fanet-is2re-all + note: 'updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale diff --git a/configs/exps/gnn/test_activ_linear.yaml b/configs/exps/gnn/test_activ_linear.yaml new file mode 100644 index 0000000000..dbf890165a --- /dev/null +++ b/configs/exps/gnn/test_activ_linear.yaml @@ -0,0 +1,128 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 # shall have been 32 + energy_head: 'weighted-av-initial-embeds' # 
False ? + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 5 + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'sfarinet reverted linear activ' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + - config: sfarinet-is2re-all + note: 'base sfarinet' + frame_averaging: 2D + fa_frames: se3-random + + - config: fanet-is2re-all + note: 'att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: att + - config: fanet-is2re-all + note: 'local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + - config: fanet-is2re-all + note: 'up_down_local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: up_down_local_env + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + - config: fanet-is2re-all + note: 'base_with_att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_with_att + - config: fanet-is2re-all + note: 'updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 128 + eval_batch_size: 128 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 64 + eval_batch_size: 64 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + batch_size: 300 + eval_batch_size: 300 + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + optim: + warmup_factor: 0.05 + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + optim: + warmup_steps: 4000 + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + optim: + lr_gamma: 0.4 + max_epochs: 20 + - config: sfarinet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 6f262c1563..5390027b97 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -11,7 +11,7 @@ from ocpmodels.common.utils import conditional_grad, get_pbc_distances from ocpmodels.models.base_model import BaseModel from ocpmodels.models.force_decoder import ForceDecoder -from ocpmodels.models.utils.attention_model import AttConv +from ocpmodels.models.utils.attention_model import TransfoAttConv from ocpmodels.models.utils.pos_encodings import PositionalEncoding from ocpmodels.modules.phys_embeddings import PhysEmbedding from ocpmodels.modules.pooling import Graclus, Hierarchical_Pooling @@ -182,9 +182,11 @@ def forward( e = torch.cat((rel_pos, self.sh), dim=1) e = self.lin_e1(e) + e = self.act(e) # can comment out + if self.second_layer_MLP: # e = self.lin_e2(e) - e = self.lin_e2(self.act(e)) + e = self.act(self.lin_e2(e)) # --- Node embedding -- @@ -220,9 +222,9 @@ def forward( h += h_pos # MLP - h = self.lin(h) + h = self.act(self.lin(h)) if self.second_layer_MLP: - h = self.lin_2(self.act(h)) + h = self.act(self.lin_2(h)) return h, e @@ -242,25 +244,27 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_h = nn.Linear(hidden_channels, hidden_channels) elif self.mp_type == "updownscale": - # self.lin_geom = 
nn.Linear(num_filters + 2 * hidden_channels, num_filters) self.lin_geom = nn.Linear(num_filters, num_filters) # like 'simple' self.lin_down = nn.Linear(hidden_channels, num_filters) self.lin_up = nn.Linear(num_filters, hidden_channels) + elif self.mp_type == "updownscale_base": + self.lin_geom = nn.Linear(num_filters + 2 * hidden_channels, num_filters) + self.lin_down = nn.Linear(hidden_channels, num_filters) + self.lin_up = nn.Linear(num_filters, hidden_channels) + elif self.mp_type == "base_with_att": - # --- Compute attention coefficients if required -- self.lin_h = nn.Linear(hidden_channels, hidden_channels) # self.lin_geom = AttConv(hidden_channels, heads=1, concat=True, bias=True) - self.lin_geom = TransformerConv( + self.lin_geom = TransfoAttConv( hidden_channels, hidden_channels, heads=1, concat=True, root_weight=False, + edge_dim=num_filters, ) - elif self.mp_type == "att": - # --- Compute attention coefficients if required -- self.lin_h = nn.Linear(hidden_channels, hidden_channels) self.lin_geom = TransformerConv( hidden_channels, @@ -275,9 +279,10 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_geom = nn.Linear(num_filters, hidden_channels) self.lin_h = nn.Linear(hidden_channels, hidden_channels) - elif self.mp_type == "up_down_local_env": - self.lin_h = nn.Linear(hidden_channels, num_filters) - self.lin_geom = nn.Linear(2 * num_filters, hidden_channels) + elif self.mp_type == "updown_local_env": + self.lin_down = nn.Linear(hidden_channels, num_filters) + self.lin_geom = nn.Linear(num_filters, num_filters) + self.lin_up = nn.Linear(2 * num_filters, hidden_channels) else: # base self.lin_geom = nn.Linear( @@ -299,43 +304,54 @@ def reset_parameters(self): def forward(self, h, edge_index, e): - if self.mp_type == "base": + # Define edge embedding + if self.mp_type in {"base", "updownscale_base"}: e = torch.cat([e, h[edge_index[0]], h[edge_index[1]]], dim=1) - # W = self.lin_e_2(self.act(self.lin_e_1(e))) # transform edge rep - if self.mp_type in {"up_down_local_env", "sfarinet", "base_with_att", "att"}: - W = e - else: - W = self.lin_geom(e) + if self.mp_type in { + "simple", + "updownscale", + "base", + "updownscale_base", + "local_env", + }: + e = self.act(self.lin_geom(e)) # TODO: remove act() ? - if self.mp_type == "updownscale": - h = self.lin_down(h) # downscale node rep. - h = self.propagate(edge_index, x=h, W=W) # propagate - h = self.lin_up(self.act(h)) # upscale node rep. + # --- Message Passing block -- + + if self.mp_type == "updownscale" or self.mp_type == "updownscale_base": + h = self.act(self.lin_down(h)) # downscale node rep. + h = self.propagate(edge_index, x=h, W=e) # propagate + h = self.act(self.lin_up(h)) # upscale node rep. 
elif self.mp_type == "att": - h = self.lin_h(self.act(h)) - h = self.lin_geom(h, edge_index, edge_attr=W) + h = self.lin_geom(h, edge_index, edge_attr=e) + h = self.act(self.lin_h(h)) + elif self.mp_type == "base_with_att": - h = self.lin_h(self.act(h)) - h = self.lin_geom(h, edge_index, W) # propagate is inside + h = self.lin_geom(h, edge_index, edge_attr=e) # propagate is inside + h = self.act(self.lin_h(h)) elif self.mp_type == "local_env": - h = self.lin_h(self.act(h)) - chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate - h = self.propagate(edge_index, x=h, W=W) # propagate + chi = self.propagate(edge_index, x=h, W=e, local_env=True) + h = self.propagate(edge_index, x=h, W=e) # propagate h = h + chi - # h = h * chi - elif self.mp_type == "up_down_local_env": - h = self.lin_h(self.act(h)) - chi = self.propagate(edge_index, x=h, W=W, local_env=True) # propagate - h = self.propagate(edge_index, x=h, W=W) # propagate + h = h = self.act(self.lin_h(h)) + + elif self.mp_type == "updown_local_env": + h = self.act(self.lin_down(h)) + chi = self.propagate(edge_index, x=h, W=e, local_env=True) + e = self.lin_geom(e) + h = self.propagate(edge_index, x=h, W=e) # propagate h = torch.cat((h, chi), dim=1) - h = self.lin_geom(h) + h = self.lin_up(h) - else: # base, simple, sfarinet - h = self.lin_h(self.act(h)) - h = self.propagate(edge_index, x=h, W=W) # propagate + elif self.mp_type in {"base", "simple", "sfarinet"}: + h = self.propagate(edge_index, x=h, W=e) # propagate + h = self.act(self.lin_h(h)) + + else: + raise ValueError("mp_type provided does not exist") return h @@ -447,6 +463,7 @@ class FANet(BaseModel): could be num_filters or hidden_channels. mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'}): specificies the MP of the interaction block. + batch_norm (bool): whether to apply batch norm after every linear layer. 
""" def __init__(self, **kwargs): @@ -460,7 +477,7 @@ def __init__(self, **kwargs): self.max_num_neighbors = kwargs["max_num_neighbors"] self.edge_embed_type = kwargs["edge_embed_type"] self.skip_co = kwargs["skip_co"] - if kwargs["mp_type"] == 'sfarinet': + if kwargs["mp_type"] == "sfarinet": kwargs["num_filters"] = kwargs["hidden_channels"] self.act = ( diff --git a/ocpmodels/models/utils/attention_model.py b/ocpmodels/models/utils/attention_model.py index b280680f51..6ada3eebb1 100644 --- a/ocpmodels/models/utils/attention_model.py +++ b/ocpmodels/models/utils/attention_model.py @@ -1,12 +1,20 @@ -from typing import Optional, Union +import math +from typing import Optional, Tuple, Union import torch import torch.nn.functional as F from torch import Tensor from torch.nn import Parameter from torch_geometric.nn.conv import MessagePassing +from torch_geometric.nn.dense.linear import Linear from torch_geometric.nn.inits import glorot, zeros -from torch_geometric.typing import Adj, OptPairTensor, OptTensor, Size +from torch_geometric.typing import ( + Adj, + OptPairTensor, + OptTensor, + PairTensor, + Size, +) from torch_geometric.utils import softmax from torch_sparse import SparseTensor @@ -140,3 +148,174 @@ def __repr__(self) -> str: f"{self.__class__.__name__}({self.in_channels}, " f"{self.hidden_channels}, heads={self.heads})" ) + + +class TransfoAttConv(MessagePassing): + r"""The graph transformer operator from the `"Masked Label Prediction: + Unified Message Passing Model for Semi-Supervised Classification" + `_ paper + + Args: + in_channels (int or tuple): Size of each input sample, or :obj:`-1` to + derive the size from the first input(s) to the forward method. + A tuple corresponds to the sizes of source and target + dimensionalities. + out_channels (int): Size of each output sample. + heads (int, optional): Number of multi-head-attentions. + (default: :obj:`1`) + concat (bool, optional): If set to :obj:`False`, the multi-head + attentions are averaged instead of concatenated. + (default: :obj:`True`) + beta (bool, optional): If set, will combine aggregation and + skip information via + dropout (float, optional): Dropout probability of the normalized + attention coefficients which exposes each node to a stochastically + sampled neighborhood during training. (default: :obj:`0`) + edge_dim (int, optional): Edge feature dimensionality (in case + there are any). Edge features are convoled with value features prior + multiplication by attention coefficient + bias (bool, optional): If set to :obj:`False`, the layer will not learn + an additive bias. (default: :obj:`True`) + root_weight (bool, optional): If set to :obj:`False`, the layer will + not add the transformed root node features to the output and the + option :attr:`beta` is set to :obj:`False`. (default: :obj:`True`) + **kwargs (optional): Additional arguments of + :class:`torch_geometric.nn.conv.MessagePassing`. 
+ """ + _alpha: OptTensor + + def __init__( + self, + in_channels: Union[int, Tuple[int, int]], + out_channels: int, + heads: int = 1, + concat: bool = True, + dropout: float = 0.0, + edge_dim: Optional[int] = None, + bias: bool = True, + root_weight: bool = True, + **kwargs, + ): + kwargs.setdefault("aggr", "add") + super(TransfoAttConv, self).__init__(node_dim=0, **kwargs) + + self.in_channels = in_channels + self.out_channels = out_channels + self.heads = heads + self.root_weight = root_weight + self.concat = concat + self.dropout = dropout + self.edge_dim = edge_dim + self._alpha = None + + if isinstance(in_channels, int): + in_channels = (in_channels, in_channels) + + self.lin_key = Linear(in_channels[0], heads * out_channels) + self.lin_query = Linear(in_channels[1], heads * out_channels) + self.lin_value = Linear(in_channels[0], heads * out_channels) + if edge_dim is not None: + self.lin_edge = Linear(edge_dim, heads * out_channels, bias=False) + else: + self.lin_edge = self.register_parameter("lin_edge", None) + + if concat: + self.lin_skip = Linear(in_channels[1], heads * out_channels, bias=bias) + else: + self.lin_skip = Linear(in_channels[1], out_channels, bias=bias) + + self.reset_parameters() + + def reset_parameters(self): + self.lin_key.reset_parameters() + self.lin_query.reset_parameters() + self.lin_value.reset_parameters() + if self.edge_dim: + self.lin_edge.reset_parameters() + self.lin_skip.reset_parameters() + + def forward( + self, + x: Union[Tensor, PairTensor], + edge_index: Adj, + edge_attr: OptTensor = None, + return_attention_weights=None, + ): + r""" + Args: + return_attention_weights (bool, optional): If set to :obj:`True`, + will additionally return the tuple + :obj:`(edge_index, attention_weights)`, holding the computed + attention weights for each edge. 
(default: :obj:`None`) + """ + + H, C = self.heads, self.out_channels + + if isinstance(x, Tensor): + x: PairTensor = (x, x) + + query = self.lin_query(x[1]).view(-1, H, C) + key = self.lin_key(x[0]).view(-1, H, C) + value = self.lin_value(x[0]).view(-1, H, C) + + # propagate_type: (query: Tensor, key:Tensor, value: Tensor, edge_attr: OptTensor) # noqa + out = self.propagate( + edge_index, + query=query, + key=key, + value=value, + edge_attr=edge_attr, + size=None, + ) + + alpha = self._alpha + self._alpha = None + + if self.concat: + out = out.view(-1, self.heads * self.out_channels) + else: + out = out.mean(dim=1) + + if self.root_weight: + x_r = self.lin_skip(x[1]) + out = out + x_r + + if isinstance(return_attention_weights, bool): + assert alpha is not None + if isinstance(edge_index, Tensor): + return out, (edge_index, alpha) + elif isinstance(edge_index, SparseTensor): + return out, edge_index.set_value(alpha, layout="coo") + else: + return out + + def message( + self, + query_i: Tensor, + key_j: Tensor, + value_j: Tensor, + edge_attr: OptTensor, + index: Tensor, + ptr: OptTensor, + size_i: Optional[int], + ) -> Tensor: + + # Compute edge embed + if self.lin_edge is not None: + assert edge_attr is not None + edge_attr = self.lin_edge(edge_attr).view(-1, self.heads, self.out_channels) + + # Compute attention coefficient + alpha = (query_i * key_j).sum(dim=-1) / math.sqrt(self.out_channels) + alpha = softmax(alpha, index, ptr, size_i) + self._alpha = alpha + alpha = F.dropout(alpha, p=self.dropout, training=self.training) + + out = value_j * alpha.view(-1, self.heads, 1) * edge_attr + return out + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}({self.in_channels}, " + f"{self.out_channels}, heads={self.heads})" + ) From 9cfc910b8312b9e7dab54b78b88e4e1c59297a75 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 12 Jan 2023 10:57:10 -0500 Subject: [PATCH 068/273] multiple frame averaging -- without dealing with inference --- ocpmodels/datasets/data_transforms.py | 2 ++ ocpmodels/preprocessing/frame_averaging.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/ocpmodels/datasets/data_transforms.py b/ocpmodels/datasets/data_transforms.py index 8f838ecd7a..5229751a97 100644 --- a/ocpmodels/datasets/data_transforms.py +++ b/ocpmodels/datasets/data_transforms.py @@ -47,9 +47,11 @@ def __init__(self, fa_type=None, fa_frames=None): "random", "det", "all", + "multiple" "se3-random", "se3-det", "se3-all", + "se3-multiple" } if self.fa_type: diff --git a/ocpmodels/preprocessing/frame_averaging.py b/ocpmodels/preprocessing/frame_averaging.py index 5ef1af72c3..b7f2d54742 100644 --- a/ocpmodels/preprocessing/frame_averaging.py +++ b/ocpmodels/preprocessing/frame_averaging.py @@ -33,6 +33,7 @@ def all_frames(eigenvec, pos, cell, fa_frames="random", pos_3D=None, det_index=0 "se3-all", "se3-random", "se3-det", + "se3-multiple", } fa_cell = deepcopy(cell) @@ -76,6 +77,13 @@ def all_frames(eigenvec, pos, cell, fa_frames="random", pos_3D=None, det_index=0 # Return frame(s) depending on method fa_frames if fa_frames == "all" or fa_frames == "se3-all": return all_fa_pos, all_cell, all_rots + + if fa_frames == "multiple" or fa_frames == "se3-multiple": + indexes = torch.bernoulli(torch.tensor([0.5] * len(all_fa_pos))) + all_fa_pos = [a for a, b in zip(all_fa_pos, indexes) if b] + all_cell = [a for a, b in zip(all_cell, indexes) if b] + all_rots = [a for a, b in zip(all_rots, indexes) if b] + return all_fa_pos, all_cell, all_rots elif fa_frames == "det" or fa_frames == 
"se3-det": return [all_fa_pos[det_index]], [all_cell[det_index]], [all_rots[det_index]] From ae8468acbe51553a98a800100db68dc3fa488262 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 12 Jan 2023 12:48:07 -0500 Subject: [PATCH 069/273] fix FA multiple + complex mp + att heads + skip_co --- configs/exps/icml/test_params.yaml | 13 +++-- configs/models/fanet.yaml | 5 +- ocpmodels/common/flags.py | 12 ++++- ocpmodels/datasets/data_transforms.py | 4 +- ocpmodels/models/fanet.py | 60 ++++++++++++++++------ ocpmodels/preprocessing/frame_averaging.py | 18 ++++--- scripts/gnn_dev.py | 7 ++- 7 files changed, 87 insertions(+), 32 deletions(-) diff --git a/configs/exps/icml/test_params.yaml b/configs/exps/icml/test_params.yaml index 85a48351e5..6c019ddc6d 100644 --- a/configs/exps/icml/test_params.yaml +++ b/configs/exps/icml/test_params.yaml @@ -15,14 +15,19 @@ default: pg_hidden_channels: 0 # shall have been 32 energy_head: False # False ? optim: - max_epochs: 10 - wandb_tags: 'prop-check-ICLM' + max_epochs: 5 + wandb_tags: 'test-extension' + cp_data_to_tmpdir: true runs: - config: sfarinet-s2ef-2M - note: 'All No TMP 1 GPU with grad target' + note: 'Multiple FA with direct_with_gradient_target' model: - regress_forces: direct + regress_forces: direct_with_gradient_target + mp_type: base_with_att + skip_co: add + complex_mp: true + att_heads: 3 optim: batch_size: 192 eval_batch_size: 192 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 7426140a2b..7822609aac 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -16,10 +16,13 @@ default: phys_hidden_channels: 0 energy_head: False # can be {False, weighted-av-initial-embeds, weighted-av-final-embeds, pooling, graclus, random} # fanet new features - skip_co: False # output skip connections + skip_co: False # output skip connections {False, "add", "concat"} second_layer_MLP: False # in EmbeddingBlock + complex_mp: False edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} + batch_norm: False # bool + att_heads: 1 # int force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True force_decoder_model_config: simple: diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 30024baa19..519465386b 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -202,7 +202,17 @@ def add_core_args(self): type=str, default="", help="Frame averaging method to use", - choices=["", "random", "det", "all", "se3-all", "se3-random", "se3-det"], + choices=[ + "", + "random", + "det", + "all", + "se3-all", + "se3-random", + "se3-det", + "multiple", + "se3-multiple", + ], ) self.parser.add_argument( "--graph_rewiring", diff --git a/ocpmodels/datasets/data_transforms.py b/ocpmodels/datasets/data_transforms.py index 5229751a97..1e85fff5c0 100644 --- a/ocpmodels/datasets/data_transforms.py +++ b/ocpmodels/datasets/data_transforms.py @@ -47,11 +47,11 @@ def __init__(self, fa_type=None, fa_frames=None): "random", "det", "all", - "multiple" + "multiple", "se3-random", "se3-det", "se3-all", - "se3-multiple" + "se3-multiple", } if self.fa_type: diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 5390027b97..fb84675096 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -5,6 +5,7 @@ from torch import nn from torch.nn import Embedding, Linear from torch_geometric.nn import MessagePassing, TransformerConv, radius_graph +from torch_geometric.nn.norm 
import BatchNorm, GraphNorm from torch_scatter import scatter from ocpmodels.common.registry import registry @@ -230,11 +231,14 @@ def forward( class InteractionBlock(MessagePassing): - def __init__(self, hidden_channels, num_filters, act, mp_type): + def __init__( + self, hidden_channels, num_filters, act, mp_type, complex_mp, att_heads=1 + ): super(InteractionBlock, self).__init__() self.act = act self.mp_type = mp_type self.hidden_channels = hidden_channels + self.complex_mp = complex_mp if self.mp_type == "simple": self.lin_geom = nn.Linear(num_filters, hidden_channels) @@ -259,7 +263,7 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_geom = TransfoAttConv( hidden_channels, hidden_channels, - heads=1, + heads=att_heads, concat=True, root_weight=False, edge_dim=num_filters, @@ -269,7 +273,7 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): self.lin_geom = TransformerConv( hidden_channels, hidden_channels, - heads=1, + heads=att_heads, concat=True, root_weight=False, edge_dim=num_filters, @@ -290,17 +294,26 @@ def __init__(self, hidden_channels, num_filters, act, mp_type): ) self.lin_h = nn.Linear(hidden_channels, hidden_channels) + if self.complex_mp: + self.other_mlp = nn.Linear(hidden_channels, hidden_channels) + + self.reset_parameters() + def reset_parameters(self): - if self.mp_type != "sfarinet": + if self.mp_type not in {"sfarinet", "att", "base_with_att"}: nn.init.xavier_uniform_(self.lin_geom.weight) self.lin_geom.bias.data.fill_(0) - nn.init.xavier_uniform_(self.lin_h.weight) - self.lin_h.bias.data.fill_(0) - if self.mp_type == "updownscale": + if self.complex_mp: + nn.init.xavier_uniform_(self.other_mlp.weight) + self.other_mlp.bias.data.fill_(0) + if self.mp_type in {"updownscale", "base_updownscale", "updown_local_env"}: nn.init.xavier_uniform_(self.lin_up.weight) self.lin_up.bias.data.fill_(0) nn.init.xavier_uniform_(self.lin_down.weight) self.lin_down.bias.data.fill_(0) + else: + nn.init.xavier_uniform_(self.lin_h.weight) + self.lin_h.bias.data.fill_(0) def forward(self, h, edge_index, e): @@ -353,6 +366,9 @@ def forward(self, h, edge_index, e): else: raise ValueError("mp_type provided does not exist") + if self.complex_mp: + h = self.act(self.other_mlp(h)) + return h def message(self, x_j, W, local_env=None): @@ -454,16 +470,18 @@ class FANet(BaseModel): (default: :obj:`4`) num_gaussians (int): The number of gaussians :math:`\mu`. (default: :obj:`50`) - second_layer_MLP (bool): use 2-layers MLP at the end of embedding block. - skip_co (bool): add a skip connection between interaction blocks and + second_layer_MLP (bool): use 2-layers MLP at the end of the Embedding block. + skip_co (str): add a skip connection between each interaction block and energy-head. edge_embed_type (str, in {'rij','all_rij','sh', 'all'}): input feature of the edge embedding block. edge_embed_hidden (int): size of edge representation. could be num_filters or hidden_channels. - mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'}): + mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env' + 'updownscale_base', 'updownscale', 'updown_local_env', 'sfarinet'}}): specificies the MP of the interaction block. batch_norm (bool): whether to apply batch norm after every linear layer. 
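+        att_heads (int): number of attention heads used by the attention-based
+            mp_type variants ('att', 'base_with_att').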
+ complex_mp (bool); whether to add a second layer MLP at the end of each Interaction """ def __init__(self, **kwargs): @@ -516,6 +534,8 @@ def __init__(self, **kwargs): kwargs["num_filters"], self.act, kwargs["mp_type"], + kwargs["complex_mp"], + kwargs["att_heads"], ) for _ in range(kwargs["num_interactions"]) ] @@ -542,6 +562,10 @@ def __init__(self, **kwargs): else None ) + # Skip co + if self.skip_co == "concat": + self.mlp_skip_co = Linear((kwargs["num_interactions"] + 1), 1) + @conditional_grad(torch.enable_grad()) def forces_forward(self, preds): return self.decoder(preds["hidden_state"]) @@ -598,21 +622,25 @@ def energy_forward(self, data): alpha = self.w_lin(h) else: alpha = None - energy_skip_co = torch.zeros(max(batch) + 1, device=h.device).unsqueeze(1) # Interaction blocks + energy_skip_co = [] for interaction in self.interaction_blocks: if self.skip_co: - energy_skip_co += self.output_block( - h, edge_index, edge_weight, batch, alpha + energy_skip_co.append( + self.output_block(h, edge_index, edge_weight, batch, alpha) ) h = h + interaction(h, edge_index, e) # Output block energy = self.output_block(h, edge_index, edge_weight, batch, alpha) - # skip-connection - if self.skip_co: - energy += energy_skip_co + + # Skip-connection + energy_skip_co.append(energy) + if self.skip_co == "concat": + energy = self.mlp_skip_co(torch.cat(energy_skip_co, dim=1)) + else: + energy = energy_skip_co.sum() preds = {"energy": energy, "pooling_loss": pooling_loss, "hidden_state": h} diff --git a/ocpmodels/preprocessing/frame_averaging.py b/ocpmodels/preprocessing/frame_averaging.py index b7f2d54742..198cff6933 100644 --- a/ocpmodels/preprocessing/frame_averaging.py +++ b/ocpmodels/preprocessing/frame_averaging.py @@ -77,13 +77,19 @@ def all_frames(eigenvec, pos, cell, fa_frames="random", pos_3D=None, det_index=0 # Return frame(s) depending on method fa_frames if fa_frames == "all" or fa_frames == "se3-all": return all_fa_pos, all_cell, all_rots - + if fa_frames == "multiple" or fa_frames == "se3-multiple": - indexes = torch.bernoulli(torch.tensor([0.5] * len(all_fa_pos))) - all_fa_pos = [a for a, b in zip(all_fa_pos, indexes) if b] - all_cell = [a for a, b in zip(all_cell, indexes) if b] - all_rots = [a for a, b in zip(all_rots, indexes) if b] - return all_fa_pos, all_cell, all_rots + index = torch.bernoulli(torch.tensor([0.5] * len(all_fa_pos))) + if index.sum() == 0: + index = random.randint(0, len(all_fa_pos) - 1) + return [all_fa_pos[index]], [all_cell[index]], [all_rots[index]] + if index.sum() == 1: + return [all_fa_pos[index]], [all_cell[index]], [all_rots[index]] + else: + all_fa_pos = [a for a, b in zip(all_fa_pos, index) if b] + all_cell = [a for a, b in zip(all_cell, index) if b] + all_rots = [a for a, b in zip(all_rots, index) if b] + return all_fa_pos, all_cell, all_rots elif fa_frames == "det" or fa_frames == "se3-det": return [all_fa_pos[det_index]], [all_cell[det_index]], [all_rots[det_index]] diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 2797921988..9660de9af1 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -19,12 +19,15 @@ # Customize args config["graph_rewiring"] = "remove-tag-0" config["frame_averaging"] = "2D" - config["fa_frames"] = "random" # "random" + config["fa_frames"] = "all" # "random" config["test_ri"] = True config["optim"] = {"max_epochs": 1} config["model"] = {"use_pbc": True} - config["model"]["edge_embed_type"] = "rij" + config["model"]["edge_embed_type"] = "all_rij" config["model"]["mp_type"] = "att" + config["model"]["skip_co"] = 
"add" + config["model"]["complex_mp"] = True + # config["model"]["regress_forces"] = "direct_with_gradient_target" checkpoint_path = None # "checkpoints/2022-04-28-11-42-56-dimenetplusplus/" + "best_checkpoint.pt" From 32368505bcbc583601a463868d321e85f25d7d13 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 12 Jan 2023 13:03:31 -0500 Subject: [PATCH 070/273] fix all embedding and start batch norm --- ocpmodels/models/fanet.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index fb84675096..aaa944d4c6 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -122,7 +122,7 @@ def __init__( elif self.edge_embed_type == "sh": self.lin_e1 = Linear(15, num_filters) elif self.edge_embed_type == "all": - self.lin_e1 = Linear(15, num_filters) + self.lin_e1 = Linear(15 + num_gaussians, num_filters) else: raise ValueError("edge_embedding_type does not exist") @@ -180,7 +180,7 @@ def forward( normalize=False, normalization="component", ) - e = torch.cat((rel_pos, self.sh), dim=1) + e = torch.cat((rel_pos, self.sh, edge_attr), dim=1) e = self.lin_e1(e) e = self.act(e) # can comment out @@ -232,7 +232,7 @@ def forward( class InteractionBlock(MessagePassing): def __init__( - self, hidden_channels, num_filters, act, mp_type, complex_mp, att_heads=1 + self, hidden_channels, num_filters, act, mp_type, complex_mp, att_heads, batch_norm ): super(InteractionBlock, self).__init__() self.act = act @@ -536,6 +536,7 @@ def __init__(self, **kwargs): kwargs["mp_type"], kwargs["complex_mp"], kwargs["att_heads"], + kwargs["batch_norm"] ) for _ in range(kwargs["num_interactions"]) ] From 8a2555ed4b997297e8292231abd7db65d6fcbb5f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 13:53:34 -0500 Subject: [PATCH 071/273] typo in `LinearWarmupCosineAnnealingLR` --- ocpmodels/modules/scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 7c5c01ac1a..8a4d082188 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -43,7 +43,7 @@ def scheduler_lambda_fn(x): self.scheduler = getattr(lr_scheduler, self.scheduler_type) scheduler_args = self.filter_kwargs(self.optim_config) self.scheduler = self.scheduler(optimizer, **scheduler_args) - elif self.scheduler_type == "WarmupCosineAnnealingLR": + elif self.scheduler_type == "LinearWarmupCosineAnnealingLR": self.warmup_scheduler = warmup.ExponentialWarmup( self.optimizer, warmup_period=self.optim_config["warmup_steps"] ) From 1161ed60b3cb857732145c958d5539497b84e1dd Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 16:51:33 -0500 Subject: [PATCH 072/273] exp manager --- ocpmodels/common/exp_manager.py | 230 ++++++++++++++++++++++++++++++++ ocpmodels/common/utils.py | 4 +- 2 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 ocpmodels/common/exp_manager.py diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py new file mode 100644 index 0000000000..02459ba8ed --- /dev/null +++ b/ocpmodels/common/exp_manager.py @@ -0,0 +1,230 @@ +from orion.client import get_experiment +from pathlib import Path +from collections import defaultdict, Counter +import wandb +from textwrap import dedent +from minydra import resolved_args +import os +import sys + +rundir = Path(os.environ["SCRATCH"]) / "ocp" / "runs" + + +class Manager: + def __init__( + self, + orion_db_path="", + name="", + 
wandb_path="mila-ocp/ocp-qm", + ): + self.api = wandb.Api() + self.wandb_path = wandb_path + self.wandb_runs = [ + r + for r in self.api.runs(wandb_path) + if "orion_hash_params" in r.config + and name in r.config.get("orion_exp_config_path", "") + ] + self.name = name + self.trial_hparams_to_rundirs = defaultdict(list) + self.exp = get_experiment( + name=name, + storage={ + "database": { + "host": str(orion_db_path), + "type": "pickleddb", + } + }, + ) + self.trials = self.exp.fetch_trials() + self.budgets = self.exp.algorithms.algorithm.budgets + self.total_budgets = sum( + b.n_trials for bracket in self.budgets for b in bracket + ) + self.id_to_trial = {t.id: t for t in self.trials} + self.id_to_wandb_runs = { + t.id: sorted( + [ + r + for r in self.wandb_runs + if r.config["orion_hash_params"] == t.hash_params + ], + key=lambda r: r.config["job_id"], + ) + for t in self.trials + } + self.hash_to_trials = defaultdict(list) + for t in self.trials: + self.hash_to_trials[t.hash_params].append(t) + self.discover_run_dirs() + print(Manager.help()) + print("\n") + print("{:31} : {:4} ".format("Trials in experiment", len(self.trials))) + print("{:31} : {:4}".format("Total expected trials", self.total_budgets)) + print( + "{:31} : {:4} ".format( + "Trials status", + " ".join( + [ + f"{k}->{v}" + for k, v in Counter([t.status for t in self.trials]).items() + ] + ), + ) + ) + print( + "{:31} : {}".format( + "Trial level(=rung) distribution", + " ".join( + [ + f"{k}->{v}" + for k, v in Counter( + map(len, self.hash_to_trials.values()) + ).items() + ] + ), + ) + ) + print( + "{:31} : {:4}".format( + "Existing unique trials executed", len(self.trial_hparams_to_rundirs) + ) + ) + print( + "{:31} : {:4}".format( + "Total existing trial run dirs", + sum(len(v) for v in self.trial_hparams_to_rundirs.values()), + ) + ) + print("{:31} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) + print("{:31} : {}".format("Algorithm's budgets", str(self.budgets))) + + def discover_run_dirs(self): + for unique in rundir.glob("*/*.unique"): + self.trial_hparams_to_rundirs[unique.stem.split("--")[-1]].append( + unique.parent + ) + + def get_dirs_for_trial(self, trial): + if trial.hash_params in self.trial_hparams_to_rundirs: + return self.trial_hparams_to_rundirs[trial.hash_params] + else: + print(f"No run dir for this trial with hparams {trial.hash_params}.") + + def get_trial_for_id(self, id): + if id in self.id_to_trial: + return self.id_to_trial[id] + else: + print("No trial for this id.") + + def get_dirs_for_id(self, id): + return self.get_dirs_for_trial(self.get_trial_for_id(id)) + + def get_reserved_wandb_runs(self): + reserved = {} + for trial_id, wandb_runs in self.id_to_wandb_runs.items(): + trial = self.get_trial_for_id(trial_id) + if trial.status == "reserved": + reserved[trial_id] = {"wandb_run": wandb_runs, "trial": trial} + return reserved + + def print_wandb_query(self): + print( + "WandB runs query:\n" + + "(" + + "|".join( + sorted( + [ + p.name + for runs in self.trial_hparams_to_rundirs.values() + for p in runs + ] + ) + ) + + ")" + ) + + @classmethod + def help(self): + return dedent( + """\ + -------------- + Manager init() + -------------- + + orion_db_path -> (str or pathlib.Path) pointing to the orion db pickle file + name -> (str) unique orion experiment name in the db + wandb_path -> (str) path to the wandb project like "{entity}/{project}" + + ---------- + Attributes + ---------- + + manager.trial_hparams_to_rundirs -> dict {trial.params_hash: [list of run dirs]} + manager.exp -> 
Orion experiment object + manager.trials -> list of Orion trial objects for this exp + manager.budgets -> list of budget of the exp's algorithm: n_trials and resources associated + manager.total_budgets -> total number of trials expected for this exp + manager.id_to_trial -> dict {trial_id: trial} + manager.id_to_wandb_runs -> dict {trial_id: [list of wandb Run objects]} + manager.hash_to_trials -> dict {hash_params: [list Orion trial objects]} + + ------- + Methods + ------- + + manager.get_dirs_for_trial(trial_obj: orion.Trial) -> list of run dirs for this trial + manager.get_trial_for_id(trial_id: str) -> trial object for this trial_id (wrapper around manager.id_to_trial[trial_id]) + manager.get_dirs_for_id(trial_id: str) -> list of run dirs for this trial_id + manager.get_reserved_wandb_runs() -> dict {trial_id: {"wandb_run": [list of wandb Run objects], "trial": trial}} + get the currently reserved trials and their wandb runs + + -------- + Examples + -------- + + m = Manager(orion_db_path="./data/orion/storage/orion_db.pkl", name="ocp-qm9-orion-debug-v1.0.0", wandb_path="mila-ocp/ocp-qm") + exp_df = m.exp.to_pandas() + reserved_wandbs = m.get_reserved_wandb_runs() + print(list(reserved_wandbs.values())[0]["wandb_run"][0].config["run_dir"]) + """ + ) + + +if __name__ == "__main__": + defaults = { + "help": False, + "name": None, + "wandb_path": None, + "orion_db_path": str( + Path(__file__).resolve().parent.parent.parent + / "data/orion/storage/orion_db.pkl" + ), + } + args = resolved_args(defaults=defaults) + if args.help: + print("🖱 Command-line (default) parameters:") + print("\n".join(" {:15} : {}".format(k, v) for k, v in defaults.items())) + print("\n\n🐍 Example command-line in IPython:") + print( + "In [1]: run ocpmodels/common/exp_manager.py", + "name='ocp-qm9-orion-debug-v1.0.0' wandb_path='mila-ocp/ocp-3'", + ) + print("\n\n🧞 Manager help:") + print(Manager.help()) + sys.exit(0) + + if not args.name: + raise ValueError("Please provide a name for the experiment.") + if not args.wandb_path: + raise ValueError("Please provide a wandb_path.") + + m = Manager( + name=args.name, + wandb_path=args.wandb_path, + orion_db_path=args.orion_db_path, + ) + + m.print_wandb_query() + exp_df = m.exp.to_pandas() + reserved_wandbs = m.get_reserved_wandb_runs() diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 7945a15121..8851943af5 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -52,10 +52,12 @@ def load_orion_exp(args): ), "Must provide orion_unique_exp_name in the command-line or the config file." 
print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") + db_path = ROOT / "data" / "orion" / "storage" / "orion_db.pkl" + db_path.parent.mkdir(parents=True, exist_ok=True) experiment = build_experiment( storage={ "database": { - "host": str(ROOT / "data" / "orion" / "storage" / "orion_db.pkl"), + "host": str(db_path), "type": "pickleddb", } }, From d71d64a9deb5d4324af82fcdbddf99c92e0bbee5 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 16:59:44 -0500 Subject: [PATCH 073/273] add dummy exp --- configs/exps/debug/dummy.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 configs/exps/debug/dummy.yaml diff --git a/configs/exps/debug/dummy.yaml b/configs/exps/debug/dummy.yaml new file mode 100644 index 0000000000..8f3b7b570b --- /dev/null +++ b/configs/exps/debug/dummy.yaml @@ -0,0 +1,27 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 24GB + cpus: 4 + gres: gpu:1 + time: 30:00 + partition: unkillable + +default: + wandb_project: ocp-debug + config: schnet-qm9-all + mode: train + wandb_tags: qm9, debug + optim: + batch_size: 64 + max_epochs: -1 + max_steps: 1e3 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + +runs: + - model: + hidden_channels: 128 + - model: + hidden_channels: 64 \ No newline at end of file From daeab01dcf2bdc1f774ee67ea235c0d31ad26a7c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 16:59:56 -0500 Subject: [PATCH 074/273] dummy uses main --- configs/exps/debug/dummy.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/debug/dummy.yaml b/configs/exps/debug/dummy.yaml index 8f3b7b570b..9da5053589 100644 --- a/configs/exps/debug/dummy.yaml +++ b/configs/exps/debug/dummy.yaml @@ -4,7 +4,7 @@ job: cpus: 4 gres: gpu:1 time: 30:00 - partition: unkillable + partition: main default: wandb_project: ocp-debug From 6e7c024fbcb65dc3b793e4bfcce638a1d1a6c4b7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 12 Jan 2023 17:55:15 -0500 Subject: [PATCH 075/273] improve exp manager --- ocpmodels/common/exp_manager.py | 67 ++++++++++++++++++++++----------- sbatch.py | 17 +++++++++ 2 files changed, 62 insertions(+), 22 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 02459ba8ed..1bbe5dd1b0 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -57,7 +57,9 @@ def __init__( for t in self.trials: self.hash_to_trials[t.hash_params].append(t) self.discover_run_dirs() - print(Manager.help()) + self.job_ids = sorted( + [p.name for runs in self.trial_hparams_to_rundirs.values() for p in runs] + ) print("\n") print("{:31} : {:4} ".format("Trials in experiment", len(self.trials))) print("{:31} : {:4}".format("Total expected trials", self.total_budgets)) @@ -87,7 +89,7 @@ def __init__( ) print( "{:31} : {:4}".format( - "Existing unique trials executed", len(self.trial_hparams_to_rundirs) + "Existing unique HP sets executed", len(self.trial_hparams_to_rundirs) ) ) print( @@ -98,9 +100,33 @@ def __init__( ) print("{:31} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) print("{:31} : {}".format("Algorithm's budgets", str(self.budgets))) + sq = set( + [ + j.strip() + for j in os.popen("/opt/slurm/bin/squeue -u $USER -o '%12i'") + .read() + .splitlines()[1:] + ] + ) + running = 
set(self.job_ids) & sq + waiting = ( + set([j.parent.name for j in rundir.glob(f"*/{self.name}.exp")]) & sq + ) - running + print( + "{:31} : {}".format( + "Jobs currently running:", + f"{len(running)} " + " ".join(running), + ) + ) + print( + "{:31} : {}".format( + "Jobs currently waiting:", + f"{len(waiting)} " + " ".join(waiting), + ) + ) def discover_run_dirs(self): - for unique in rundir.glob("*/*.unique"): + for unique in rundir.glob(f"*/{self.name}--*.unique"): self.trial_hparams_to_rundirs[unique.stem.split("--")[-1]].append( unique.parent ) @@ -125,24 +151,11 @@ def get_reserved_wandb_runs(self): for trial_id, wandb_runs in self.id_to_wandb_runs.items(): trial = self.get_trial_for_id(trial_id) if trial.status == "reserved": - reserved[trial_id] = {"wandb_run": wandb_runs, "trial": trial} + reserved[trial_id] = {"wandb_runs": wandb_runs, "trial": trial} return reserved def print_wandb_query(self): - print( - "WandB runs query:\n" - + "(" - + "|".join( - sorted( - [ - p.name - for runs in self.trial_hparams_to_rundirs.values() - for p in runs - ] - ) - ) - + ")" - ) + print("WandB runs query:\n" + "(" + "|".join(self.job_ids) + ")") @classmethod def help(self): @@ -176,7 +189,7 @@ def help(self): manager.get_dirs_for_trial(trial_obj: orion.Trial) -> list of run dirs for this trial manager.get_trial_for_id(trial_id: str) -> trial object for this trial_id (wrapper around manager.id_to_trial[trial_id]) manager.get_dirs_for_id(trial_id: str) -> list of run dirs for this trial_id - manager.get_reserved_wandb_runs() -> dict {trial_id: {"wandb_run": [list of wandb Run objects], "trial": trial}} + manager.get_reserved_wandb_runs() -> dict {trial_id: {"wandb_runs": [list of wandb Run objects], "trial": trial}} get the currently reserved trials and their wandb runs -------- @@ -186,7 +199,7 @@ def help(self): m = Manager(orion_db_path="./data/orion/storage/orion_db.pkl", name="ocp-qm9-orion-debug-v1.0.0", wandb_path="mila-ocp/ocp-qm") exp_df = m.exp.to_pandas() reserved_wandbs = m.get_reserved_wandb_runs() - print(list(reserved_wandbs.values())[0]["wandb_run"][0].config["run_dir"]) + print(list(reserved_wandbs.values())[0]["wandb_runs"][0].config["run_dir"]) """ ) @@ -215,10 +228,20 @@ def help(self): sys.exit(0) if not args.name: - raise ValueError("Please provide a name for the experiment.") + raise ValueError( + "Please provide `name=` for the experiment." + + " See `$ python exp_manager.py help`" + ) if not args.wandb_path: - raise ValueError("Please provide a wandb_path.") + raise ValueError( + "Please provide `wandb_path='{entity}/{project}}'`." 
+ + " See `$ python exp_manager.py help`" + ) + print( + "💃 Status of experiment", + f"'{args.name}' and wandb entity/project '{args.wandb_path}':", + ) m = Manager( name=args.name, wandb_path=args.wandb_path, diff --git a/sbatch.py b/sbatch.py index bb0ff8ab36..ea97c8a7df 100644 --- a/sbatch.py +++ b/sbatch.py @@ -6,6 +6,7 @@ from shutil import copyfile import sys import re +import yaml template = """\ #!/bin/bash @@ -192,6 +193,21 @@ def add_jobid_to_log(j, command_line, exp_name=None): logfile.write_text("\n".join(lines)) +def write_orion_config(args, outdir): + if "--orion_exp_config_path=" not in args.get("py_args", ""): + return + orion_yaml_path = ( + args.py_args.split("--orion_exp_config_path=")[-1] + .split(" --")[0] + .replace("'", "") + ) + copyfile(orion_yaml_path, outdir / "orion_exp_config.yaml") + config = yaml.safe_load(Path(orion_yaml_path).read_text()) + if "unique_exp_name" in config: + unique_exp_name = config["unique_exp_name"] + (outdir / f"{unique_exp_name}.exp").touch() + + if __name__ == "__main__": # has the submission been successful? success = False @@ -348,6 +364,7 @@ def add_jobid_to_log(j, command_line, exp_name=None): print("Creating directory", str(output_parent)) output_parent.mkdir(parents=True, exist_ok=True) copyfile(script_path, output_parent / script_path.name) + write_orion_config(args, output_parent) if not args.verbose: print("Submitted batch job", jobid) add_jobid_to_log(jobid, sbatch_command_line, args.exp_name) From b0fc220953a6bf865535728614de6b586a37800c Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 13 Jan 2023 07:43:30 -0500 Subject: [PATCH 076/273] fix skip co --- configs/exps/gnn/edge_embed_type.yaml | 35 +++- configs/exps/gnn/edge_embed_type_s2ef.yaml | 2 +- configs/exps/gnn/mp_type_3.yaml | 182 +++++++++++++++++++++ ocpmodels/models/fanet.py | 20 ++- 4 files changed, 228 insertions(+), 11 deletions(-) create mode 100644 configs/exps/gnn/mp_type_3.yaml diff --git a/configs/exps/gnn/edge_embed_type.yaml b/configs/exps/gnn/edge_embed_type.yaml index b7a0418fe2..9354db6d8a 100644 --- a/configs/exps/gnn/edge_embed_type.yaml +++ b/configs/exps/gnn/edge_embed_type.yaml @@ -22,19 +22,42 @@ default: cp_data_to_tmpdir: true runs: - - config: sfarinet-is2re-all - note: 'Sfarinet baseline sym' - frame_averaging: 2D - fa_frames: se3-random - - config: sfarinet-is2re-all + - config: fanet-is2re-all # 2678275 note: 'all rij' frame_averaging: 2D fa_frames: se3-random model: edge_embed_type: all_rij - - config: sfarinet-is2re-all + mp_type: base + - config: fanet-is2re-all # 2678276 note: 'all' frame_averaging: 2D fa_frames: se3-random model: edge_embed_type: all + mp_type: base + - config: sfarinet-is2re-all # 2678277 + note: 'all rij sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all_rij + mp_type: sfarinet + - config: sfarinet-is2re-all # 2678278 + note: 'sfarinet all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + mp_type: sfarinet + - config: sfarinet-is2re-all # 2678279 + note: 'sfarinet all' + frame_averaging: 2D + fa_frames: se3-random + model: + edge_embed_type: all + mp_type: base + skip_co: "concat" + complex_mp: true + batch_norm: true + second_layer_mlp: true \ No newline at end of file diff --git a/configs/exps/gnn/edge_embed_type_s2ef.yaml b/configs/exps/gnn/edge_embed_type_s2ef.yaml index 5ad120c07d..5b642e33c8 100644 --- a/configs/exps/gnn/edge_embed_type_s2ef.yaml +++ b/configs/exps/gnn/edge_embed_type_s2ef.yaml @@ -14,7 +14,7 @@ default: phys_embeds: True 
tag_hidden_channels: 64 pg_hidden_channels: 0 # shall have been 32 - energy_head: 'weighted-av-initial-embeds' # False ? + energy_head: False # False ? regress_forces: direct_with_gradient_target wandb_tags: 's2ef-archi-tests' optim: diff --git a/configs/exps/gnn/mp_type_3.yaml b/configs/exps/gnn/mp_type_3.yaml new file mode 100644 index 0000000000..8ba039780b --- /dev/null +++ b/configs/exps/gnn/mp_type_3.yaml @@ -0,0 +1,182 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 + energy_head: False + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 10 + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'simple' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + - config: fanet-is2re-all + note: 'updown_scale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updown_scale + - config: fanet-is2re-all + note: 'local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + - config: fanet-is2re-all + note: 'updown_local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updown_local_env + - config: fanet-is2re-all + note: 'base_with_att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_with_att + - config: fanet-is2re-all + note: 'att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: att + - config: fanet-is2re-all + note: 'base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + - config: fanet-is2re-all + note: 'updownscale_base' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + + - config: fanet-is2re-all + note: 'sfarinet' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + skip_co: concat + - config: fanet-is2re-all + note: 'att skip co' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: att + skip_co: concat + - config: fanet-is2re-all + note: 'local_env add' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + skip_co: add + - config: fanet-is2re-all + note: 'base complex mp' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + complex_mp: true + - config: fanet-is2re-all + note: 'simple complex mp' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + complex_mp: true + - config: fanet-is2re-all + note: 'updown_local_env' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updown_local_env + second_layer_mlp: true + - config: fanet-is2re-all + note: 'base_with_att' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_with_att + second_layer_mlp: true + - config: fanet-is2re-all + note: 'sfarinet ' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + batch_norm: true + - config: fanet-is2re-all + note: 'base_updownscale' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base_updownscale + batch_norm: true + - config: fanet-is2re-all + note: 'simple bigger layers' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + num_filters: 500 + num_gaussians: 200 + hidden_channels: 500 + num_interactions: 4 + tag_hidden_channels: 128 + - 
config: fanet-is2re-all + note: 'more interactions and bigger filters' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: simple + num_filters: 500 + num_gaussians: 100 + num_interactions: 6 + tag_hidden_channels: 128 + - config: fanet-is2re-all + note: 'smaller lr and bigger gamma' + frame_averaging: 2D + fa_frames: se3-random + optim: + lr_initial: 0.0005 + lr_gamma: 0.4 + - config: fanet-is2re-all + note: 'bigger cutoff' + frame_averaging: 2D + fa_frames: se3-random + model: + cutoff: 10.0 + - config: fanet-is2re-all + note: 'DA' + frame_averaging: DA + optim: + max_epochs: 15 \ No newline at end of file diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index aaa944d4c6..471e7c227e 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -232,13 +232,23 @@ def forward( class InteractionBlock(MessagePassing): def __init__( - self, hidden_channels, num_filters, act, mp_type, complex_mp, att_heads, batch_norm + self, + hidden_channels, + num_filters, + act, + mp_type, + complex_mp, + att_heads, + batch_norm, ): super(InteractionBlock, self).__init__() self.act = act self.mp_type = mp_type self.hidden_channels = hidden_channels self.complex_mp = complex_mp + self.batch_norm = batch_norm + if batch_norm: + self.graph_norm = GraphNorm(hidden_channels) if self.mp_type == "simple": self.lin_geom = nn.Linear(num_filters, hidden_channels) @@ -360,6 +370,8 @@ def forward(self, h, edge_index, e): h = self.lin_up(h) elif self.mp_type in {"base", "simple", "sfarinet"}: + if self.batch_norm: + h = self.graph_norm(h) h = self.propagate(edge_index, x=h, W=e) # propagate h = self.act(self.lin_h(h)) @@ -536,7 +548,7 @@ def __init__(self, **kwargs): kwargs["mp_type"], kwargs["complex_mp"], kwargs["att_heads"], - kwargs["batch_norm"] + kwargs["batch_norm"], ) for _ in range(kwargs["num_interactions"]) ] @@ -640,8 +652,8 @@ def energy_forward(self, data): energy_skip_co.append(energy) if self.skip_co == "concat": energy = self.mlp_skip_co(torch.cat(energy_skip_co, dim=1)) - else: - energy = energy_skip_co.sum() + elif self.skip_co == "add": + energy = sum(energy_skip_co) preds = {"energy": energy, "pooling_loss": pooling_loss, "hidden_state": h} From 241819d09429c45f52b50c21096dfa4aa0f29afc Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 13 Jan 2023 10:29:50 -0500 Subject: [PATCH 077/273] batch norm for all --- configs/exps/gnn/batch_norm.yaml | 53 ++++++++++++++++++++++++++++++++ ocpmodels/models/fanet.py | 14 +++++++-- scripts/gnn_dev.py | 8 +++-- 3 files changed, 70 insertions(+), 5 deletions(-) create mode 100644 configs/exps/gnn/batch_norm.yaml diff --git a/configs/exps/gnn/batch_norm.yaml b/configs/exps/gnn/batch_norm.yaml new file mode 100644 index 0000000000..4655fa3a9b --- /dev/null +++ b/configs/exps/gnn/batch_norm.yaml @@ -0,0 +1,53 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 0 + energy_head: False + edge_embed_type: all_rij + wandb_tags: 'mp-type' + optim: + max_epochs: 10 + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'batch norm after propagate Interaction' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + batch_norm: True + - config: fanet-is2re-all + note: 'batch norm after propagate Interaction' + frame_averaging: 2D + fa_frames: se3-random + model: + 
mp_type: att + batch_norm: True + - config: fanet-is2re-all + note: 'batch norm after propagate Interaction' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: local_env + batch_norm: True + - config: fanet-is2re-all + note: 'batch norm after propagate Interaction' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: sfarinet + batch_norm: True diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 471e7c227e..f0ac255805 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -345,20 +345,28 @@ def forward(self, h, edge_index, e): if self.mp_type == "updownscale" or self.mp_type == "updownscale_base": h = self.act(self.lin_down(h)) # downscale node rep. h = self.propagate(edge_index, x=h, W=e) # propagate + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = self.act(self.lin_up(h)) # upscale node rep. elif self.mp_type == "att": h = self.lin_geom(h, edge_index, edge_attr=e) + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) elif self.mp_type == "base_with_att": h = self.lin_geom(h, edge_index, edge_attr=e) # propagate is inside + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) elif self.mp_type == "local_env": chi = self.propagate(edge_index, x=h, W=e, local_env=True) h = self.propagate(edge_index, x=h, W=e) # propagate h = h + chi + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = h = self.act(self.lin_h(h)) elif self.mp_type == "updown_local_env": @@ -366,13 +374,15 @@ def forward(self, h, edge_index, e): chi = self.propagate(edge_index, x=h, W=e, local_env=True) e = self.lin_geom(e) h = self.propagate(edge_index, x=h, W=e) # propagate + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = torch.cat((h, chi), dim=1) h = self.lin_up(h) elif self.mp_type in {"base", "simple", "sfarinet"}: - if self.batch_norm: - h = self.graph_norm(h) h = self.propagate(edge_index, x=h, W=e) # propagate + if self.batch_norm: + h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) else: diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 9660de9af1..0f59d42311 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -19,14 +19,16 @@ # Customize args config["graph_rewiring"] = "remove-tag-0" config["frame_averaging"] = "2D" - config["fa_frames"] = "all" # "random" + config["fa_frames"] = "random" # "random" config["test_ri"] = True config["optim"] = {"max_epochs": 1} config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "all_rij" - config["model"]["mp_type"] = "att" - config["model"]["skip_co"] = "add" + config["model"]["mp_type"] = "base" + config["model"]["skip_co"] = False + config["model"]["att_heads"] = 3 config["model"]["complex_mp"] = True + config["model"]["batch_norm"] = True # config["model"]["regress_forces"] = "direct_with_gradient_target" checkpoint_path = None From 0f8ef1f9290eac79e7d2458f87972ac552ed7b1c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 10:52:11 -0500 Subject: [PATCH 078/273] handle multiplicative factor for Orion sampling --- main.py | 12 +++++++-- ocpmodels/common/utils.py | 54 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 73d75f5e15..8f83773364 100644 --- a/main.py +++ b/main.py @@ -21,6 +21,7 @@ from ocpmodels.common.utils import ( JOB_ID, auto_note, + apply_mult_factor, build_config, continue_from_slurm_job_id, continue_orion_exp, @@ -76,8 +77,15 @@ def run(self, orion_exp=None): if 
distutils.is_master(): if orion_exp: orion_trial = orion_exp.suggest(1) - self.hparams = unflatten_dict(orion_trial.params, sep="/") - self.hparams["orion_hash_params"] = orion_trial.hash_params + self.hparams = unflatten_dict( + apply_mult_factor( + orion_trial.hash_params, + self.trainer_config.get("orion_mult_factor"), + sep="/", + ), + sep="/", + ) + self.hparams["orion_hash_params"] = orion_trial.params self.hparams["orion_unique_exp_name"] = orion_exp.name should_be_0 = distutils.get_rank() diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 8851943af5..9203819107 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -44,6 +44,60 @@ JOB_ID = os.environ.get("SLURM_JOB_ID") +def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): + """ + Multiplies all values of orion_hparams listed in mult_factor_dict["targets"] + by mult_factor_dict["value"]. + + eg: + >>> orion_hparams = { + "model/hidden_channels": 4, + "model/num_layers": 4, + "optim/batch_size": 4, + "optim/initial_lr": 0.001, + "frame_averaging": "", + } + + >>> mult_factor_dict = {"value": 32, "targets": "hidden_channels, batch_size"} + + >>> apply_mult_factor(orion_hparams, mult_factor_dict, sep="/") + { + "model/hidden_channels": 128, + "model/num_layers": 4, + "optim/batch_size": 128, + "optim/initial_lr": 0.001, + "frame_averaging": "" + } + + Args: + orion_hparams (_type_): _description_ + mult_factor_dict (_type_): _description_ + sep (str, optional): _description_. Defaults to ".". + + Returns: + _type_: _description_ + """ + if not mult_factor_dict: + return orion_hparams + if not isinstance(mult_factor_dict, dict): + print( + f">>> Warning: ignoring apply_mult_factor, not a dict: {mult_factor_dict}." + ) + if "value" not in mult_factor_dict or "targets" not in mult_factor_dict: + print( + ">>> Warning: ignoring apply_mult_factor, " + + " missing 'value' or 'targets' keys: {}.".format(mult_factor_dict) + ) + value, targets = mult_factor_dict["value"], mult_factor_dict["targets"] + targets = set([t.strip() for t in targets.split(",")]) + updated_hparams = copy.deepcopy(orion_hparams) + for k, v in orion_hparams.items(): + target = k.split(sep)[-1] + if target in targets: + updated_hparams[k] = v * value + return updated_hparams + + def load_orion_exp(args): exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) From 9a578f8fed34da45317378b7c4bdbacea1c4786c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 10:52:33 -0500 Subject: [PATCH 079/273] refactor prints to `print_status` --- ocpmodels/common/exp_manager.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 1bbe5dd1b0..858250dfdb 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -61,10 +61,13 @@ def __init__( [p.name for runs in self.trial_hparams_to_rundirs.values() for p in runs] ) print("\n") - print("{:31} : {:4} ".format("Trials in experiment", len(self.trials))) - print("{:31} : {:4}".format("Total expected trials", self.total_budgets)) + self.print_status() + + def print_status(self): + print("{:32} : {:4} ".format("Trials in experiment", len(self.trials))) + print("{:32} : {:4}".format("Total expected trials", self.total_budgets)) print( - "{:31} : {:4} ".format( + "{:32} : {:4} ".format( "Trials status", " ".join( [ @@ -75,7 +78,7 @@ def __init__( ) ) print( - "{:31} : {}".format( + "{:32} : {}".format( "Trial 
level(=rung) distribution", " ".join( [ @@ -88,18 +91,18 @@ def __init__( ) ) print( - "{:31} : {:4}".format( + "{:32} : {:4}".format( "Existing unique HP sets executed", len(self.trial_hparams_to_rundirs) ) ) print( - "{:31} : {:4}".format( + "{:32} : {:4}".format( "Total existing trial run dirs", sum(len(v) for v in self.trial_hparams_to_rundirs.values()), ) ) - print("{:31} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) - print("{:31} : {}".format("Algorithm's budgets", str(self.budgets))) + print("{:32} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) + print("{:32} : {}".format("Algorithm's budgets", str(self.budgets))) sq = set( [ j.strip() @@ -113,13 +116,13 @@ def __init__( set([j.parent.name for j in rundir.glob(f"*/{self.name}.exp")]) & sq ) - running print( - "{:31} : {}".format( + "{:32} : {}".format( "Jobs currently running:", f"{len(running)} " + " ".join(running), ) ) print( - "{:31} : {}".format( + "{:32} : {}".format( "Jobs currently waiting:", f"{len(waiting)} " + " ".join(waiting), ) @@ -155,7 +158,7 @@ def get_reserved_wandb_runs(self): return reserved def print_wandb_query(self): - print("WandB runs query:\n" + "(" + "|".join(self.job_ids) + ")") + print(f"{'WandB runs query:':32}\n" + "(" + "|".join(self.job_ids) + ")") @classmethod def help(self): From fc904290d7c8bc5f8686f6ef995677005a359b49 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 10:53:49 -0500 Subject: [PATCH 080/273] update exps --- configs/exps/debug/orion.yaml | 8 +-- configs/exps/icml/is2re-10k/fanet-orion.yaml | 60 ++++++++++++++++++++ configs/exps/qm7x/schnet-from-spooky.yaml | 17 +++++- 3 files changed, 78 insertions(+), 7 deletions(-) create mode 100644 configs/exps/icml/is2re-10k/fanet-orion.yaml diff --git a/configs/exps/debug/orion.yaml b/configs/exps/debug/orion.yaml index 93eb5f5de4..7c7a528837 100644 --- a/configs/exps/debug/orion.yaml +++ b/configs/exps/debug/orion.yaml @@ -4,7 +4,7 @@ job: cpus: 4 gres: gpu:16gb:1 time: 1:00:00 - partition: main + partition: long code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab env: ocp-a100 @@ -41,10 +41,10 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: ocp-qm9-orion-debug-v1.0.0 + unique_exp_name: ocp-qm9-orion-debug-v1.0.1 space: - optim/max_steps: fidelity(1e5, 1e6, base=3) + optim/max_steps: fidelity(1e3, 1e4, base=3) optim/batch_size: uniform(32, 128, discrete=True) optim/lr_initial: loguniform(1e-5, 5e-3, precision=2) model/num_gaussians: uniform(16, 200, discrete=True) @@ -56,5 +56,5 @@ orion: algorithms: asha: seed: 123 - num_rungs: 5 + num_rungs: 4 num_brackets: 1 diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml new file mode 100644 index 0000000000..cf89e45531 --- /dev/null +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -0,0 +1,60 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 24GB + cpus: 4 + gres: gpu:16gb:1 + time: 2:00:00 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab + env: ocp-a100 + +default: + wandb_project: ocp-4 + config: fanet-is2re-10k + mode: train + test_ri: true + wandb_tags: is2re-10k, orion + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + decay_steps: max_steps + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: -1 + note: + model: name, num_gaussians, hidden_channels, num_filters, 
num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, batch_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: batch_size, num_gaussians, hidden_channels, num_filters, num_interactions, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 20 + + unique_exp_name: fanet-is2re-10k-v1.0.0 + + space: + optim/max_epochs: fidelity(10, 50, base=4) + optim/batch_size: uniform(1, 16, discrete=True) + optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) + model/num_gaussians: uniform(20, 150, discrete=True) + model/hidden_channels: uniform(1, 16, discrete=True) + model/num_filters: uniform(1, 16, discrete=True) + model/num_interactions: uniform(1, 7, discrete=True) + model/phys_embeds: choices([True, False]) + model/batch_norm: choices([True, False]) + model/pg_hidden_channels: uniform(0, 3, discrete=True) + model/phys_hidden_channels: uniform(0, 3, discrete=True) + model/tag_hidden_channels: uniform(0, 3, discrete=True) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) + model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) + model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + algorithms: + asha: + seed: 123 + num_rungs: 5 + num_brackets: 1 diff --git a/configs/exps/qm7x/schnet-from-spooky.yaml b/configs/exps/qm7x/schnet-from-spooky.yaml index 6f2164c4d6..5c8dce6fe9 100644 --- a/configs/exps/qm7x/schnet-from-spooky.yaml +++ b/configs/exps/qm7x/schnet-from-spooky.yaml @@ -26,14 +26,15 @@ default: optim: batch_size, lr_initial optim: batch_size: 10 - warmup_steps: 1000 + warmup_steps: 3000 lr_initial: 0.0001 # parameters EMA # ema_decay: 0.999 - decay_steps: 750000 + decay_steps: max_steps scheduler: decay_rate: 0.01 - max_steps: 1000000 + max_steps: 2000000 + eval_every: 50000 model: hidden_channels: 128 num_filters: 128 @@ -46,6 +47,16 @@ runs: ema_decay: 0.999 - optim: scheduler: LinearWarmupCosineAnnealingLR + model: + cutoff: 6.0 + - optim: + scheduler: LinearWarmupCosineAnnealingLR + model: + num_gaussians: 100 + - optim: + scheduler: LinearWarmupCosineAnnealingLR + model: + num_filters: 256 - optim: ema_decay: 0.999 scheduler: LinearWarmupCosineAnnealingLR From b48b9a14c6e79af83ee50ec3d2a0b7b68ae725f4 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 10:57:04 -0500 Subject: [PATCH 081/273] create orion search yamls *after* confirm --- launch_exp.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index e40199df7a..5e0f1b0d04 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -180,9 +180,6 @@ def get_args_or_exp(key, args, exp): exp["unique_exp_name"] = unique_exp_name search_path = orion_base / "search-spaces" / f"{ts}-{unique_exp_name}.yaml" - search_path.parent.mkdir(exist_ok=True, parents=True) - assert not search_path.exists() - search_path.write_text(dump(exp["orion"])) runs = [ { "orion_exp_config_path": str(search_path), @@ -230,6 +227,11 @@ def get_args_or_exp(key, args, exp): if confirm == "y": try: + if "orion" in exp: + search_path.parent.mkdir(exist_ok=True, parents=True) + assert not search_path.exists() + search_path.write_text(dump(exp["orion"])) + outputs = [] for c, command in enumerate(commands): 
print(f"Launching job {c:3}", end="\r") From ac4b3ce0e8cff28acbe48a22fe3c646b9227cad6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:08:02 -0500 Subject: [PATCH 082/273] typo in run --- main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 8f83773364..5e4f99c427 100644 --- a/main.py +++ b/main.py @@ -79,13 +79,13 @@ def run(self, orion_exp=None): orion_trial = orion_exp.suggest(1) self.hparams = unflatten_dict( apply_mult_factor( - orion_trial.hash_params, + orion_trial.params, self.trainer_config.get("orion_mult_factor"), sep="/", ), sep="/", ) - self.hparams["orion_hash_params"] = orion_trial.params + self.hparams["orion_hash_params"] = orion_trial.hash_params self.hparams["orion_unique_exp_name"] = orion_exp.name should_be_0 = distutils.get_rank() From a0a9c0089c0cf672b015a2d341a32ebd0ef244ab Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:28:56 -0500 Subject: [PATCH 083/273] allow "" for fa_frames --- ocpmodels/datasets/data_transforms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/datasets/data_transforms.py b/ocpmodels/datasets/data_transforms.py index 1e85fff5c0..64556a0038 100644 --- a/ocpmodels/datasets/data_transforms.py +++ b/ocpmodels/datasets/data_transforms.py @@ -44,6 +44,7 @@ def __init__(self, fa_type=None, fa_frames=None): "DA", } assert self.fa_frames in { + "", # equivalent to random, necessary still for sweeps "random", "det", "all", From b70a9ee54b5dc0f1e1e608e8d211791681de7ce1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:29:03 -0500 Subject: [PATCH 084/273] add watch mode --- ocpmodels/common/exp_manager.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 858250dfdb..75d7566975 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -6,6 +6,8 @@ from minydra import resolved_args import os import sys +import time +from datetime import datetime rundir = Path(os.environ["SCRATCH"]) / "ocp" / "runs" @@ -216,6 +218,7 @@ def help(self): Path(__file__).resolve().parent.parent.parent / "data/orion/storage/orion_db.pkl" ), + "watch": -1, } args = resolved_args(defaults=defaults) if args.help: @@ -254,3 +257,30 @@ def help(self): m.print_wandb_query() exp_df = m.exp.to_pandas() reserved_wandbs = m.get_reserved_wandb_runs() + + if args.watch and args.watch > 0: + if args.watch < 15: + print("Cannot watch to often, setting to 15 seconds.") + args.watch = 15 + try: + print("👀 Watching for exp status every every", args.watch, "seconds.") + while True: + time.sleep(args.watch) + print() + print("=" * 30) + print("=" * 30) + print() + print( + "💃 Status of experiment", + f"'{args.name}' and wandb entity/project '{args.wandb_path}' @", + str(datetime.now()).split(".")[0], + ) + print() + m = Manager( + name=args.name, + wandb_path=args.wandb_path, + orion_db_path=args.orion_db_path, + ) + except KeyboardInterrupt: + print("👋 Exiting.") + sys.exit(0) From 721521181fac2fd7374459c8851db2e7955f9a88 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:37:16 -0500 Subject: [PATCH 085/273] add clean dirs command output --- launch_exp.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/launch_exp.py b/launch_exp.py index 5e0f1b0d04..b7495e02f4 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -17,6 +17,12 @@ def util_strings(jobs, yaml_comments=False): s = "All jobs launched: " + ", 
".join(jobs) s += "\nCancel experiment: scancel " + " ".join(jobs) s += "\nWandB query for dashboard: (" + "|".join(jobs) + ")" + s += ( + "\n Delete experiment run dirs: " + + 'ocp_run_dirs="$SCRATCH/ocp/runs; for jid in ' + + " ".join(jobs) + + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done"' + ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) return s @@ -173,6 +179,8 @@ def get_args_or_exp(key, args, exp): if "orion" in exp: orion_base = ROOT / "data" / "orion" assert "runs" not in exp, "Cannot use both Orion and runs" + assert "space" in exp["orion"], "Must specify orion.space" + assert "algorithms" in exp["orion"], "Must specify orion.algorithms" n_jobs = get_args_or_exp("n_jobs", args, exp["orion"]) unique_exp_name = get_args_or_exp("unique_exp_name", args, exp["orion"]) From 9e0325f1aba103590c9c607b6f98884f4c501ddc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 11:38:00 -0500 Subject: [PATCH 086/273] typo in print --- launch_exp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index b7495e02f4..81547103c7 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -19,9 +19,9 @@ def util_strings(jobs, yaml_comments=False): s += "\nWandB query for dashboard: (" + "|".join(jobs) + ")" s += ( "\n Delete experiment run dirs: " - + 'ocp_run_dirs="$SCRATCH/ocp/runs; for jid in ' + + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) - + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done"' + + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done' ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) From 2fb9ec6c1803558672dd39526e6320c60045abdc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 12:15:36 -0500 Subject: [PATCH 087/273] auto `max_steps` if `max_epochs` --- ocpmodels/trainers/base_trainer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index b5a7b71001..a324f73787 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -250,6 +250,11 @@ def load_datasets(self): f"dataset length ({len(self.datasets[split])}),", f"and batch_size ({batch_size})\n", ) + else: + self.config["optim"]["max_steps"] = int( + self.config["optim"]["max_epochs"] + * (len(self.datasets[split]) / batch_size) + ) self.samplers[split] = self.get_sampler( self.datasets[split], batch_size, shuffle=shuffle From c0a32e933ac253fbe580133029d7cfa2f16e38cc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 14:25:15 -0500 Subject: [PATCH 088/273] batch_norm to graph_norm (& updown GN fix) --- configs/exps/gnn/batch_norm.yaml | 8 +++--- configs/exps/gnn/edge_embed_type.yaml | 2 +- configs/exps/gnn/mp_type_3.yaml | 4 +-- configs/exps/icml/is2re-10k/fanet-orion.yaml | 28 +++++++++++--------- configs/models/fanet.yaml | 2 +- launch_exp.py | 9 ++++--- ocpmodels/models/fanet.py | 26 +++++++++--------- scripts/gnn_dev.py | 2 +- 8 files changed, 43 insertions(+), 38 deletions(-) diff --git a/configs/exps/gnn/batch_norm.yaml b/configs/exps/gnn/batch_norm.yaml index 4655fa3a9b..df99a320eb 100644 --- a/configs/exps/gnn/batch_norm.yaml +++ b/configs/exps/gnn/batch_norm.yaml @@ -29,25 +29,25 @@ runs: fa_frames: se3-random model: mp_type: base - batch_norm: True + graph_norm: True - config: fanet-is2re-all note: 'batch norm after propagate Interaction' frame_averaging: 2D 
fa_frames: se3-random model: mp_type: att - batch_norm: True + graph_norm: True - config: fanet-is2re-all note: 'batch norm after propagate Interaction' frame_averaging: 2D fa_frames: se3-random model: mp_type: local_env - batch_norm: True + graph_norm: True - config: fanet-is2re-all note: 'batch norm after propagate Interaction' frame_averaging: 2D fa_frames: se3-random model: mp_type: sfarinet - batch_norm: True + graph_norm: True diff --git a/configs/exps/gnn/edge_embed_type.yaml b/configs/exps/gnn/edge_embed_type.yaml index 9354db6d8a..34793b8c4e 100644 --- a/configs/exps/gnn/edge_embed_type.yaml +++ b/configs/exps/gnn/edge_embed_type.yaml @@ -59,5 +59,5 @@ runs: mp_type: base skip_co: "concat" complex_mp: true - batch_norm: true + graph_norm: true second_layer_mlp: true \ No newline at end of file diff --git a/configs/exps/gnn/mp_type_3.yaml b/configs/exps/gnn/mp_type_3.yaml index 8ba039780b..ec14d42dcd 100644 --- a/configs/exps/gnn/mp_type_3.yaml +++ b/configs/exps/gnn/mp_type_3.yaml @@ -133,14 +133,14 @@ runs: fa_frames: se3-random model: mp_type: sfarinet - batch_norm: true + graph_norm: true - config: fanet-is2re-all note: 'base_updownscale' frame_averaging: 2D fa_frames: se3-random model: mp_type: base_updownscale - batch_norm: true + graph_norm: true - config: fanet-is2re-all note: 'simple bigger layers' frame_averaging: 2D diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index cf89e45531..f073505c5e 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -3,7 +3,7 @@ job: mem: 24GB cpus: 4 gres: gpu:16gb:1 - time: 2:00:00 + time: 30:00 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab env: ocp-a100 @@ -17,42 +17,44 @@ default: cp_data_to_tmpdir: true graph_rewiring: remove-tag-0 optim: - warmup_steps: 3000 + warmup_steps: 500 # parameters EMA ema_decay: 0.999 decay_steps: max_steps scheduler: LinearWarmupCosineAnnealingLR max_epochs: -1 note: - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, batch_norm + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial _root_: frame_averaging, fa_frames orion_mult_factor: value: 32 - targets: batch_size, num_gaussians, hidden_channels, num_filters, num_interactions, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + targets: batch_size, hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-is2re-10k-v1.0.0 + unique_exp_name: fanet-is2re-10k-v1.1.0 space: - optim/max_epochs: fidelity(10, 50, base=4) - optim/batch_size: uniform(1, 16, discrete=True) + optim/max_epochs: fidelity(20, 100, base=4) + optim/batch_size: uniform(1, 10, discrete=True) optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) - model/num_gaussians: uniform(20, 150, discrete=True) + # model/graph_norm: choices([True, False]) + model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) model/hidden_channels: uniform(1, 16, discrete=True) + model/mp_type: 
choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) model/num_filters: uniform(1, 16, discrete=True) + model/num_gaussians: uniform(20, 150, discrete=True) model/num_interactions: uniform(1, 7, discrete=True) - model/phys_embeds: choices([True, False]) - model/batch_norm: choices([True, False]) model/pg_hidden_channels: uniform(0, 3, discrete=True) + model/phys_embeds: choices([True, False]) model/phys_hidden_channels: uniform(0, 3, discrete=True) model/tag_hidden_channels: uniform(0, 3, discrete=True) - model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) - model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) - model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + frame_averaging: choices(["", "2D", "3D", "DA"]) + fa_frames: choices(["", "random", "det", "all", "se3-all", "se3-random", "se3-det", "multiple", "se3-multiple"]) algorithms: asha: seed: 123 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 7822609aac..0c789467b3 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -21,7 +21,7 @@ default: complex_mp: False edge_embed_type: rij # {'rij','all_rij','sh', 'all'}) mp_type: base # {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env'} - batch_norm: False # bool + graph_norm: False # bool att_heads: 1 # int force_decoder_type: "mlp" # can be {"" or "simple"} | only used if regress_forces is True force_decoder_model_config: diff --git a/launch_exp.py b/launch_exp.py index 81547103c7..e146f816dd 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -15,13 +15,14 @@ def util_strings(jobs, yaml_comments=False): s = "All jobs launched: " + ", ".join(jobs) - s += "\nCancel experiment: scancel " + " ".join(jobs) - s += "\nWandB query for dashboard: (" + "|".join(jobs) + ")" + s += "\nCancel experiment:\n $ scancel " + " ".join(jobs) + s += "\nWandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( - "\n Delete experiment run dirs: " + "\nDelete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) - + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done' + + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid";' + + " done; unset $ocp_run_dirs; unset $jid" ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index f0ac255805..092f1d8d3c 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -239,16 +239,18 @@ def __init__( mp_type, complex_mp, att_heads, - batch_norm, + graph_norm, ): super(InteractionBlock, self).__init__() self.act = act self.mp_type = mp_type self.hidden_channels = hidden_channels self.complex_mp = complex_mp - self.batch_norm = batch_norm - if batch_norm: - self.graph_norm = GraphNorm(hidden_channels) + self.graph_norm = graph_norm + if graph_norm: + self.graph_norm = GraphNorm( + hidden_channels if "updown" not in self.mp_type else num_filters + ) if self.mp_type == "simple": self.lin_geom = nn.Linear(num_filters, hidden_channels) @@ -345,19 +347,19 @@ def forward(self, h, edge_index, e): if self.mp_type == "updownscale" or self.mp_type == "updownscale_base": h = self.act(self.lin_down(h)) # downscale node rep. 
h = self.propagate(edge_index, x=h, W=e) # propagate - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = self.act(self.lin_up(h)) # upscale node rep. elif self.mp_type == "att": h = self.lin_geom(h, edge_index, edge_attr=e) - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) elif self.mp_type == "base_with_att": h = self.lin_geom(h, edge_index, edge_attr=e) # propagate is inside - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) @@ -365,7 +367,7 @@ def forward(self, h, edge_index, e): chi = self.propagate(edge_index, x=h, W=e, local_env=True) h = self.propagate(edge_index, x=h, W=e) # propagate h = h + chi - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = h = self.act(self.lin_h(h)) @@ -374,14 +376,14 @@ def forward(self, h, edge_index, e): chi = self.propagate(edge_index, x=h, W=e, local_env=True) e = self.lin_geom(e) h = self.propagate(edge_index, x=h, W=e) # propagate - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = torch.cat((h, chi), dim=1) h = self.lin_up(h) elif self.mp_type in {"base", "simple", "sfarinet"}: h = self.propagate(edge_index, x=h, W=e) # propagate - if self.batch_norm: + if self.graph_norm: h = self.act(self.graph_norm(h)) h = self.act(self.lin_h(h)) @@ -502,7 +504,7 @@ class FANet(BaseModel): mp_type (str, in {'base', 'simple', 'updownscale', 'att', 'base_with_att', 'local_env' 'updownscale_base', 'updownscale', 'updown_local_env', 'sfarinet'}}): specificies the MP of the interaction block. - batch_norm (bool): whether to apply batch norm after every linear layer. + graph_norm (bool): whether to apply batch norm after every linear layer. 
complex_mp (bool); whether to add a second layer MLP at the end of each Interaction """ @@ -558,7 +560,7 @@ def __init__(self, **kwargs): kwargs["mp_type"], kwargs["complex_mp"], kwargs["att_heads"], - kwargs["batch_norm"], + kwargs["graph_norm"], ) for _ in range(kwargs["num_interactions"]) ] diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 0f59d42311..617e54cc18 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -28,7 +28,7 @@ config["model"]["skip_co"] = False config["model"]["att_heads"] = 3 config["model"]["complex_mp"] = True - config["model"]["batch_norm"] = True + config["model"]["graph_norm"] = True # config["model"]["regress_forces"] = "direct_with_gradient_target" checkpoint_path = None From a057d58c8e6b07397f0aa991da4916fda4d98e19 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 14:32:16 -0500 Subject: [PATCH 089/273] add max samples option --- ocpmodels/trainers/base_trainer.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index a324f73787..357c6e1f40 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -222,6 +222,7 @@ def load_datasets(self): transform = get_transforms(self.config) # TODO: train/val/test behavior batch_size = self.config["optim"]["batch_size"] max_steps = self.config["optim"].get("max_steps", -1) + max_samples = self.config["optim"].get("max_samples", -1) for split, ds_conf in self.config["dataset"].items(): if split == "default_val": @@ -234,11 +235,28 @@ def load_datasets(self): shuffle = False if split == "train": shuffle = True - if max_steps > 0: + if max_samples > 0: if self.config["optim"].get("max_epochs", -1) > 0: print( - "WARNING: Both max_steps and max_epochs are set.", - "Using max_steps.", + "\nWARNING: Both max_samples and max_epochs are set.", + "Using max_samples.", + ) + if self.config["optim"].get("max_steps", -1) > 0: + print( + "WARNING: Both max_samples and max_steps are set.", + "Using max_samples.\n", + ) + self.config["optim"]["max_epochs"] = int( + np.ceil(max_samples / len(self.datasets[split])) + ) + self.config["optim"]["max_steps"] = int( + np.ceil(max_samples / batch_size) + ) + elif max_steps > 0: + if self.config["optim"].get("max_epochs", -1) > 0: + print( + "\nWARNING: Both max_steps and max_epochs are set.", + "Using max_steps.\n", ) self.config["optim"]["max_epochs"] = int( np.ceil(max_steps / (len(self.datasets[split]) / batch_size)) From bdc40b79ebf084af84a11fe22f934fec9678b526 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 14:55:40 -0500 Subject: [PATCH 090/273] update exp --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 14 +++++++------- ocpmodels/trainers/single_trainer.py | 7 +++---- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index f073505c5e..da605c5b83 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -14,34 +14,34 @@ default: mode: train test_ri: true wandb_tags: is2re-10k, orion - cp_data_to_tmpdir: true + cp_data_to_tmpdir: false graph_rewiring: remove-tag-0 + log_train_every: 20 optim: - warmup_steps: 500 + warmup_steps: 100 # parameters EMA ema_decay: 0.999 decay_steps: max_steps scheduler: LinearWarmupCosineAnnealingLR - max_epochs: -1 + batch_size: 256 note: model: name, num_gaussians, hidden_channels, num_filters, 
num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial _root_: frame_averaging, fa_frames orion_mult_factor: value: 32 - targets: batch_size, hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-is2re-10k-v1.1.0 + unique_exp_name: fanet-is2re-10k-v1.1.1 space: optim/max_epochs: fidelity(20, 100, base=4) - optim/batch_size: uniform(1, 10, discrete=True) optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) - # model/graph_norm: choices([True, False]) + model/graph_norm: choices([True, False]) model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) model/hidden_channels: uniform(1, 16, discrete=True) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 0c7109bc25..d964ffeba9 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -203,9 +203,6 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): first_eval = True log_train_every = self.config["log_train_every"] - print(f"Logging train metrics every {log_train_every} steps") - print(f"Printing train metrics every {self.config['print_every']} steps") - # Calculate start_epoch from step instead of loading the epoch number # to prevent inconsistencies due to different batch size in checkpoint. start_epoch = self.step // n_train @@ -214,7 +211,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): model_run_time = 0 if not self.silent: - print(f"--- 🔄 Beginning of Training @ {self.now}---") + print(f"--- 🔄 Beginning of Training @ {self.now}---\n") + print(f"Logging train metrics every {log_train_every} steps") + print(f"Printing train metrics every {self.config['print_every']} steps") for epoch_int in range(start_epoch, self.config["optim"]["max_epochs"]): From 18cf7337d912089cbd67c3f3b74d5831476c6499 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 15:26:41 -0500 Subject: [PATCH 091/273] fix multiple fa frames --- ocpmodels/preprocessing/frame_averaging.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocpmodels/preprocessing/frame_averaging.py b/ocpmodels/preprocessing/frame_averaging.py index 198cff6933..0dcf856c25 100644 --- a/ocpmodels/preprocessing/frame_averaging.py +++ b/ocpmodels/preprocessing/frame_averaging.py @@ -84,6 +84,7 @@ def all_frames(eigenvec, pos, cell, fa_frames="random", pos_3D=None, det_index=0 index = random.randint(0, len(all_fa_pos) - 1) return [all_fa_pos[index]], [all_cell[index]], [all_rots[index]] if index.sum() == 1: + _, index = torch.max(index, dim=0) return [all_fa_pos[index]], [all_cell[index]], [all_rots[index]] else: all_fa_pos = [a for a, b in zip(all_fa_pos, index) if b] From dcb3bf9a510a01972db2d07a212c173bb7d79503 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 15:36:37 -0500 Subject: [PATCH 092/273] fix `updownscale_base` --- configs/exps/gnn/mp_type_3.yaml | 4 ++-- ocpmodels/models/fanet.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/exps/gnn/mp_type_3.yaml b/configs/exps/gnn/mp_type_3.yaml index ec14d42dcd..f40f275ea8 100644 --- 
a/configs/exps/gnn/mp_type_3.yaml +++ b/configs/exps/gnn/mp_type_3.yaml @@ -135,11 +135,11 @@ runs: mp_type: sfarinet graph_norm: true - config: fanet-is2re-all - note: 'base_updownscale' + note: 'updownscale_base' frame_averaging: 2D fa_frames: se3-random model: - mp_type: base_updownscale + mp_type: updownscale_base graph_norm: true - config: fanet-is2re-all note: 'simple bigger layers' diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 092f1d8d3c..d1577e9ece 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -318,7 +318,7 @@ def reset_parameters(self): if self.complex_mp: nn.init.xavier_uniform_(self.other_mlp.weight) self.other_mlp.bias.data.fill_(0) - if self.mp_type in {"updownscale", "base_updownscale", "updown_local_env"}: + if self.mp_type in {"updownscale", "updownscale_base", "updown_local_env"}: nn.init.xavier_uniform_(self.lin_up.weight) self.lin_up.bias.data.fill_(0) nn.init.xavier_uniform_(self.lin_down.weight) From dcfe8550f8ac2d6bc5709a261da646437de563e0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 15:36:46 -0500 Subject: [PATCH 093/273] sort printed jobs --- ocpmodels/common/exp_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 75d7566975..c5321075df 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -120,13 +120,13 @@ def print_status(self): print( "{:32} : {}".format( "Jobs currently running:", - f"{len(running)} " + " ".join(running), + f"{len(running)} " + " ".join(sorted(running)), ) ) print( "{:32} : {}".format( "Jobs currently waiting:", - f"{len(waiting)} " + " ".join(waiting), + f"{len(waiting)} " + " ".join(sorted(waiting)), ) ) From 784a2a1e2cbba587635221cc8cfd4f1949db9053 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 18:11:29 -0500 Subject: [PATCH 094/273] update db path --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 14 +++++++------- launch_exp.py | 3 +-- ocpmodels/common/utils.py | 6 ++++-- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index da605c5b83..8d770b152c 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -1,8 +1,8 @@ # more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij job: - mem: 24GB + mem: 8GB cpus: 4 - gres: gpu:16gb:1 + gres: gpu:1 time: 30:00 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab @@ -36,7 +36,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-is2re-10k-v1.1.1 + unique_exp_name: fanet-is2re-10k-v1.2.0 space: optim/max_epochs: fidelity(20, 100, base=4) @@ -44,15 +44,15 @@ orion: model/graph_norm: choices([True, False]) model/edge_embed_type: choices(["rij", "all_rij", "sh", "all"]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) - model/hidden_channels: uniform(1, 16, discrete=True) + model/hidden_channels: uniform(4, 16, discrete=True) model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) model/num_filters: uniform(1, 16, discrete=True) model/num_gaussians: uniform(20, 150, discrete=True) model/num_interactions: uniform(1, 7, discrete=True) - model/pg_hidden_channels: 
uniform(0, 3, discrete=True) + model/pg_hidden_channels: uniform(0, 2, discrete=True) model/phys_embeds: choices([True, False]) - model/phys_hidden_channels: uniform(0, 3, discrete=True) - model/tag_hidden_channels: uniform(0, 3, discrete=True) + model/phys_hidden_channels: uniform(0, 2, discrete=True) + model/tag_hidden_channels: uniform(0, 2, discrete=True) frame_averaging: choices(["", "2D", "3D", "DA"]) fa_frames: choices(["", "random", "det", "all", "se3-all", "se3-random", "se3-det", "multiple", "se3-multiple"]) algorithms: diff --git a/launch_exp.py b/launch_exp.py index e146f816dd..da3d16c6f9 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -21,8 +21,7 @@ def util_strings(jobs, yaml_comments=False): "\nDelete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) - + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid";' - + " done; unset $ocp_run_dirs; unset $jid" + + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 9203819107..73d2d00925 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -106,7 +106,9 @@ def load_orion_exp(args): ), "Must provide orion_unique_exp_name in the command-line or the config file." print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") - db_path = ROOT / "data" / "orion" / "storage" / "orion_db.pkl" + exp_name = args.orion_unique_exp_name or exp_config["unique_exp_name"] + db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) + db_path = ROOT / "data" / "orion" / "storage" / f"{db_id}_db.pkl" db_path.parent.mkdir(parents=True, exist_ok=True) experiment = build_experiment( storage={ @@ -115,7 +117,7 @@ def load_orion_exp(args): "type": "pickleddb", } }, - name=args.orion_unique_exp_name or exp_config["unique_exp_name"], + name=exp_name, space=exp_config["space"], algorithms=exp_config["algorithms"], ) From 578100ebf30c23e6509e7269032a0338ecc09b5f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Fri, 13 Jan 2023 18:40:34 -0500 Subject: [PATCH 095/273] fix attention --- ocpmodels/models/fanet.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index d1577e9ece..1d2a2a3bc2 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -122,7 +122,7 @@ def __init__( elif self.edge_embed_type == "sh": self.lin_e1 = Linear(15, num_filters) elif self.edge_embed_type == "all": - self.lin_e1 = Linear(15 + num_gaussians, num_filters) + self.lin_e1 = Linear(18 + num_gaussians, num_filters) else: raise ValueError("edge_embedding_type does not exist") @@ -276,7 +276,7 @@ def __init__( hidden_channels, hidden_channels, heads=att_heads, - concat=True, + concat=False, root_weight=False, edge_dim=num_filters, ) @@ -286,7 +286,7 @@ def __init__( hidden_channels, hidden_channels, heads=att_heads, - concat=True, + concat=False, root_weight=False, edge_dim=num_filters, ) From 4578584edb71fa7e02d7afdb91b3225212f21e31 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 13 Jan 2023 19:41:43 -0500 Subject: [PATCH 096/273] remove bad defaults --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index 8d770b152c..0f9500b973 100644 --- 
a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -5,8 +5,6 @@ job: gres: gpu:1 time: 30:00 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab - env: ocp-a100 default: wandb_project: ocp-4 @@ -23,7 +21,7 @@ default: ema_decay: 0.999 decay_steps: max_steps scheduler: LinearWarmupCosineAnnealingLR - batch_size: 256 + batch_size: 64 note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial @@ -36,7 +34,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-is2re-10k-v1.2.0 + unique_exp_name: fanet-is2re-10k-v1.3.0 space: optim/max_epochs: fidelity(20, 100, base=4) From a498ac8f4b20cbfb91f2ea27cdb734dd485eb875 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 10:58:35 -0500 Subject: [PATCH 097/273] add `no_metrics_denorm` flag --- configs/exps/qm7x/schnet-from-spooky.yaml | 38 +++++++++++++++++------ ocpmodels/common/flags.py | 8 +++++ ocpmodels/trainers/single_trainer.py | 16 +++++++--- 3 files changed, 49 insertions(+), 13 deletions(-) diff --git a/configs/exps/qm7x/schnet-from-spooky.yaml b/configs/exps/qm7x/schnet-from-spooky.yaml index 5c8dce6fe9..5ad2fa09e5 100644 --- a/configs/exps/qm7x/schnet-from-spooky.yaml +++ b/configs/exps/qm7x/schnet-from-spooky.yaml @@ -12,7 +12,7 @@ default: wandb_project: ocp-qm mode: train test_ri: true - wandb_tags: qm7x + wandb_tags: qm7x, no_metrics_denorm phys_hidden_channels: 0 phys_embeds: False energy_head: False @@ -20,6 +20,7 @@ default: tag_hidden_channels: 0 frame_averaging: "" cp_data_to_tmpdir: true + no_metrics_denorm: true note: task: name model: name, num_gaussians, hidden_channels, num_filters, num_interactions @@ -42,21 +43,40 @@ default: num_interactions: 6 cutoff: 5.0 -runs: +# runs: +# - optim: +# ema_decay: 0.999 +# - optim: +# scheduler: LinearWarmupCosineAnnealingLR +# model: +# cutoff: 6.0 +# - optim: +# scheduler: LinearWarmupCosineAnnealingLR +# model: +# num_gaussians: 100 +# - optim: +# scheduler: LinearWarmupCosineAnnealingLR +# model: +# num_filters: 256 +# - optim: +# ema_decay: 0.999 +# scheduler: LinearWarmupCosineAnnealingLR + +runs: # all above contributed positively to improve eval/val_ood/energy_mae. + # so we're combining them here. + test with slightly larger batch size. + # And with no_metrics_denorm. - optim: ema_decay: 0.999 - - optim: scheduler: LinearWarmupCosineAnnealingLR model: cutoff: 6.0 - - optim: - scheduler: LinearWarmupCosineAnnealingLR - model: + num_filters: 256 num_gaussians: 100 - optim: + batch_size: 32 + ema_decay: 0.999 scheduler: LinearWarmupCosineAnnealingLR model: + cutoff: 6.0 num_filters: 256 - - optim: - ema_decay: 0.999 - scheduler: LinearWarmupCosineAnnealingLR + num_gaussians: 100 \ No newline at end of file diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 519465386b..1199c344ba 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -263,6 +263,14 @@ def add_core_args(self): + " the search space MUST be the same. If it is not, the job will crash." + " If you change the search space, you must change the experiment name.", ) + self.parser.add_argument( + "--no_metrics_denorm", + type=bool, + default=False, + help="Whether or not to disable prediction denormalization to compute" + + " metrics. 
If True, targets are normalized instead of denormalizing " + + "preds.", + ) flags = Flags() diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index d964ffeba9..eeb1b50356 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -594,12 +594,20 @@ def compute_metrics( self.normalizer.get("normalize_labels") and "grad_target" in self.normalizers ): - preds["forces"] = self.normalizers["grad_target"].denorm( - preds["forces"] - ) + if not self.config.get("no_metrics_denorm"): + preds["forces"] = self.normalizers["grad_target"].denorm( + preds["forces"] + ) + else: + target["forces"] = self.normalizers["grad_target"].norm( + target["forces"] + ) if self.normalizer.get("normalize_labels") and "target" in self.normalizers: - preds["energy"] = self.normalizers["target"].denorm(preds["energy"]) + if not self.config.get("no_metrics_denorm"): + preds["energy"] = self.normalizers["target"].denorm(preds["energy"]) + else: + target["energy"] = self.normalizers["target"].norm(target["energy"]) metrics = evaluator.eval(preds, target, prev_metrics=metrics) From 772896c062bac622c7c0b6328f512f4879f15f9b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 11:01:20 -0500 Subject: [PATCH 098/273] prints --- launch_exp.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index da3d16c6f9..5ecf95e743 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -25,6 +25,8 @@ def util_strings(jobs, yaml_comments=False): ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) + else: + s = "\n • ".join(s.splitlines()) return s @@ -231,7 +233,7 @@ def get_args_or_exp(key, args, exp): text += "\n<><><> Experiment config:\n\n-----" + exp_file.read_text() + "-----" text += "\n<><><> Experiment runs:\n\n • " + "\n\n • ".join(commands) + separator - confirm = input("\n🚦 Confirm? [y/n]") + confirm = input("\n🚦 Confirm? [y/n] : ") if confirm == "y": try: @@ -262,11 +264,12 @@ def get_args_or_exp(key, args, exp): text += f"{separator}All jobs launched: {' '.join(jobs)}" with outfile.open("w") as f: f.write(text) - print(f"Output written to {str(outfile)}") + print("\n🎉 Done!") + print(f" • Output written to {str(outfile)}") print(util_strings(jobs)) yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) print( - "Experiment summary YAML in ", + " • Experiment summary YAML in ", f"./{str(yml_out.relative_to(Path.cwd()))}", ) else: From 651dd81b22501833de8f0a4b5c51eeb2fb60b066 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 11:19:21 -0500 Subject: [PATCH 099/273] allow for comments in `runs: ` line --- configs/exps/qm7x/schnet-from-spooky.yaml | 13 +++++++++++-- launch_exp.py | 21 +++++++++++---------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/configs/exps/qm7x/schnet-from-spooky.yaml b/configs/exps/qm7x/schnet-from-spooky.yaml index 5ad2fa09e5..597174bc91 100644 --- a/configs/exps/qm7x/schnet-from-spooky.yaml +++ b/configs/exps/qm7x/schnet-from-spooky.yaml @@ -1,7 +1,7 @@ # trainset has 4068193 samples job: - mem: 32GB - cpus: 8 + mem: 12GB + cpus: 4 gres: gpu:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 @@ -76,6 +76,15 @@ runs: # all above contributed positively to improve eval/val_ood/energy_mae. 
batch_size: 32 ema_decay: 0.999 scheduler: LinearWarmupCosineAnnealingLR + model: + cutoff: 6.0 + num_filters: 256 + num_gaussians: 100 + - optim: + batch_size: 512 + lr_initial: 0.0005 + ema_decay: 0.999 + scheduler: LinearWarmupCosineAnnealingLR model: cutoff: 6.0 num_filters: 256 diff --git a/launch_exp.py b/launch_exp.py index 5ecf95e743..1bb0702615 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -14,11 +14,11 @@ def util_strings(jobs, yaml_comments=False): - s = "All jobs launched: " + ", ".join(jobs) - s += "\nCancel experiment:\n $ scancel " + " ".join(jobs) - s += "\nWandB query for dashboard:\n (" + "|".join(jobs) + ")" + s = " • All jobs launched: " + ", ".join(jobs) + s += "\n • Cancel experiment:\n $ scancel " + " ".join(jobs) + s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( - "\nDelete experiment run dirs:\n $ " + "\n • Delete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' @@ -26,7 +26,7 @@ def util_strings(jobs, yaml_comments=False): if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) else: - s = "\n • ".join(s.splitlines()) + s = "\n │ ".join(s.splitlines()) return s @@ -88,8 +88,9 @@ def write_exp_yaml_and_jobs(exp_file, outfile, jobs): jobs (list[str]): List of jobs, one per run line in the yaml exp_file """ lines = exp_file.read_text().splitlines() - if "runs:" in lines: - run_line = lines.index("runs:") + run_lines = [i for i, l in enumerate(lines) if l.strip().startswith("runs:")] + if run_lines: + run_line = run_lines[0] j = 0 for i, line in enumerate(lines[run_line:]): if line.strip().startswith("- "): @@ -244,7 +245,7 @@ def get_args_or_exp(key, args, exp): outputs = [] for c, command in enumerate(commands): - print(f"Launching job {c:3}", end="\r") + print(f"Launching job {c+1:3}", end="\r") outputs.append(os.popen(command).read().strip()) except KeyboardInterrupt: is_interrupted = True @@ -264,9 +265,9 @@ def get_args_or_exp(key, args, exp): text += f"{separator}All jobs launched: {' '.join(jobs)}" with outfile.open("w") as f: f.write(text) - print("\n🎉 Done!") - print(f" • Output written to {str(outfile)}") + print("\n\n ✅ Done!") print(util_strings(jobs)) + # print(f" • Output written to {str(outfile)}") yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) print( " • Experiment summary YAML in ", From dc8b494cac2bddf0208486b24e77d2de7c1f03c9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 11:37:35 -0500 Subject: [PATCH 100/273] explicit `optimizer: AdamW` in configs --- configs/models/tasks/is2re.yaml | 3 ++- configs/models/tasks/qm7x.yaml | 1 + configs/models/tasks/qm9.yaml | 3 +++ configs/models/tasks/s2ef.yaml | 2 ++ 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/configs/models/tasks/is2re.yaml b/configs/models/tasks/is2re.yaml index 059ef62c53..cf47f159de 100644 --- a/configs/models/tasks/is2re.yaml +++ b/configs/models/tasks/is2re.yaml @@ -9,7 +9,8 @@ default: metric: mae labels: - relaxed energy - + optim: + optimizer: AdamW normalizer: null model: otf_graph: False diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index af410f6f70..98de512a2d 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -17,6 +17,7 @@ default: - total system energy optim: + optimizer: AdamW force_coefficient: 30 energy_coefficient: 1 energy_grad_coefficient: 10 diff --git a/configs/models/tasks/qm9.yaml 
b/configs/models/tasks/qm9.yaml index 262ec232e7..e53c071188 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -9,6 +9,9 @@ default: use_pbc: False force_decoder_type: null + optim: + optimizer: AdamW + task: dataset: qm9 description: "QM9 U0 internal energy at 0K prediction from structure structure." diff --git a/configs/models/tasks/s2ef.yaml b/configs/models/tasks/s2ef.yaml index 92c08cdcf3..4916788b07 100644 --- a/configs/models/tasks/s2ef.yaml +++ b/configs/models/tasks/s2ef.yaml @@ -13,6 +13,8 @@ default: eval_on_free_atoms: True normalizer: null mode: train + optim: + optimizer: AdamW model: otf_graph: False max_num_neighbors: 40 From 171261127a2c2d82e81f9d67624137ac572f40cd Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:06:47 -0500 Subject: [PATCH 101/273] orion qm9 exp --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 58 ++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9.yaml diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml new file mode 100644 index 0000000000..6b4f5b5877 --- /dev/null +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -0,0 +1,58 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 8GB + cpus: 4 + gres: gpu:1 + time: 30:00 + partition: long + +default: + wandb_project: ocp-4 + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion + log_train_every: 100 + optim: + warmup_steps: 2000 + # parameters EMA + ema_decay: 0.999 + decay_steps: max_steps + scheduler: LinearWarmupCosineAnnealingLR + batch_size: 64 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + frame_averaging: 3D + fa_frames: random + model: + edge_embed_type: all_rij + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 20 + + unique_exp_name: fanet-qm9-v1.0.0 + + space: + optim/max_epochs: fidelity(30, 300, base=6) + optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) + model/graph_norm: choices([True, False]) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) + model/hidden_channels: uniform(5, 16, discrete=True) + model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + model/num_filters: uniform(3, 16, discrete=True) + model/num_gaussians: uniform(20, 150, discrete=True) + model/num_interactions: uniform(1, 7, discrete=True) + model/pg_hidden_channels: uniform(0, 2, discrete=True) + model/phys_embeds: choices([True, False]) + model/tag_hidden_channels: uniform(0, 2, discrete=True) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From fed104f4c1da9a557815ec58b1ebeb01ade24a8f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:20:12 -0500 Subject: [PATCH 102/273] `IS_NARVAL` --- ocpmodels/common/logger.py | 11 +++++++---- ocpmodels/common/utils.py | 10 +++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py 
index dab33affcc..c84734e129 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -14,6 +14,7 @@ import wandb from ocpmodels.common.registry import registry +from ocpmodels.common.utils import IS_NARVAL NTFY_OK = False try: @@ -124,14 +125,15 @@ def __init__(self, trainer_config): sbatch_files = list( Path(self.trainer_config["run_dir"]).glob("sbatch_script*.sh") ) - if len(sbatch_files) == 1: + if len(sbatch_files) == 1 and not IS_NARVAL: wandb.save(str(sbatch_files[0])) self.url = wandb.run.get_url() with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: f.write(self.url) - self.collect_output_files(policy="live") - self.collect_output_files(policy="end") + if not IS_NARVAL: + self.collect_output_files(policy="live") + self.collect_output_files(policy="end") def watch(self, model): wandb.watch(model) @@ -169,7 +171,8 @@ def finish(self, error_or_signal=False): self.add_tags("Preempted") if error_or_signal is True: exit_code = 1 - self.collect_output_files(policy="now") + if not IS_NARVAL: + self.collect_output_files(policy="now") wandb.finish(exit_code=exit_code) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 73d2d00925..f81ba0ade5 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -42,6 +42,10 @@ OCP_TASKS = {"s2ef", "is2re", "is2es"} ROOT = Path(__file__).resolve().parent.parent.parent JOB_ID = os.environ.get("SLURM_JOB_ID") +IS_NARVAL = ( + "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") + or os.environ.get("HOME") == "/home/vsch" +) def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): @@ -303,11 +307,7 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): def override_narval_paths(trainer_config): - is_narval = ( - "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") - or os.environ.get("HOME") == "/home/vsch" - or trainer_config["narval"] - ) + is_narval = IS_NARVAL or trainer_config.get("narval") if not is_narval: return trainer_config path_overrides = yaml.safe_load( From 3960dd8ac9c947a5add2a0cd6ebabde17e83a0b0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:25:50 -0500 Subject: [PATCH 103/273] set default time --- sbatch.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/sbatch.py b/sbatch.py index ea97c8a7df..a7a5fa2f45 100644 --- a/sbatch.py +++ b/sbatch.py @@ -8,6 +8,11 @@ import re import yaml +IS_NARVAL = ( + "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") + or os.environ.get("HOME") == "/home/vsch" +) + template = """\ #!/bin/bash {sbatch_params} @@ -212,7 +217,6 @@ def write_orion_config(args, outdir): # has the submission been successful? 
success = False sbatch_py_vars = {} - is_narval = "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") # repository root root = Path(__file__).resolve().parent @@ -292,9 +296,12 @@ def write_orion_config(args, outdir): } if args.time: sbatch_params["time"] = args.time - if is_narval: + if IS_NARVAL: del sbatch_params["partition"] sbatch_params["account"] = "rrg-bengioy-ad_gpu" + if "time" not in sbatch_params: + print("WARNING: no time limit specified, setting to 1 day") + sbatch_params["time"] = "1-00:00:00" if "a100" in args.env: modules += ["cuda/11.2"] From fdcf667b649d8ad0420c4b9057bb2eb55b09e421 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:26:15 -0500 Subject: [PATCH 104/273] increase time --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 6b4f5b5877..0ecb00c06c 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -3,7 +3,7 @@ job: mem: 8GB cpus: 4 gres: gpu:1 - time: 30:00 + time: 02:50:00 partition: long default: From 3f9526ff88e3867115c156a1e79b1c5523c4229c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:27:51 -0500 Subject: [PATCH 105/273] fix seconds handling --- launch_exp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index 1bb0702615..43c1d951ae 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -125,7 +125,10 @@ def find_exp(name): def seconds_to_time_str(seconds): - seconds = int(seconds) + try: + seconds = int(seconds) + except ValueError: + return seconds hours = seconds // 3600 minutes = (seconds % 3600) // 60 seconds = seconds % 60 From e126c67b4aafd4d0ed4c58a0d9935e53363a9c2c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:28:51 -0500 Subject: [PATCH 106/273] more space in prints --- launch_exp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 43c1d951ae..764d8b535c 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -15,10 +15,10 @@ def util_strings(jobs, yaml_comments=False): s = " • All jobs launched: " + ", ".join(jobs) - s += "\n • Cancel experiment:\n $ scancel " + " ".join(jobs) + s += "\n • Cancel experiment:\n $ scancel " + " ".join(jobs) s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( - "\n • Delete experiment run dirs:\n $ " + "\n • Delete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' From aabcaa8189b66dd5ed72f77da598cffb2ec7100e Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Sat, 14 Jan 2023 12:44:05 -0500 Subject: [PATCH 107/273] first config orion IS2RE --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 4 +- .../exps/icml/is2re-all/fanet-orion-1.yaml | 59 +++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 configs/exps/icml/is2re-all/fanet-orion-1.yaml diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index 8d770b152c..bd4c587b12 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -5,8 +5,8 @@ job: gres: gpu:1 time: 30:00 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab - env: ocp-a100 + # code_loc: 
/home/mila/s/schmidtv/ocp-project/ocp-drlab + # env: ocp-a100 default: wandb_project: ocp-4 diff --git a/configs/exps/icml/is2re-all/fanet-orion-1.yaml b/configs/exps/icml/is2re-all/fanet-orion-1.yaml new file mode 100644 index 0000000000..a39db9ec02 --- /dev/null +++ b/configs/exps/icml/is2re-all/fanet-orion-1.yaml @@ -0,0 +1,59 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + time: 10:00:00 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-is2re-all + mode: train + test_ri: true + wandb_tags: is2re-all, orion + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + frame_averaging: 2D + fa_frames: random + optim: + scheduler: LinearWarmupCosineAnnealingLR + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 50 + + unique_exp_name: fanet-is2re-all-v1 + + space: + optim/max_epochs: fidelity(8, 30, base=6) + optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) + model/graph_norm: choices([True, False]) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) + model/hidden_channels: uniform(5, 18, discrete=True) + model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + model/num_filters: uniform(2, 16, discrete=True) + model/num_gaussians: uniform(30, 150, discrete=True) + model/num_interactions: uniform(1, 6, discrete=True) + model/pg_hidden_channels: uniform(0, 2, discrete=True) + model/phys_embeds: choices([True, False]) + model/tag_hidden_channels: uniform(0, 2, discrete=True) + model/complex_mp: choices([True, False]) + model/att_heads: choices([1,2,3,4]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["add", "concat", False]) + model/cutoff: choices([4.0, 6.0, 10.0]) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From 6e0b58ed9b4afd14236a86b9920ce341f8e840b0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 12:45:28 -0500 Subject: [PATCH 108/273] add qm9 narval paths --- configs/models/tasks/_narval.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/models/tasks/_narval.yaml b/configs/models/tasks/_narval.yaml index 9a465c231f..9e43ec5a95 100644 --- a/configs/models/tasks/_narval.yaml +++ b/configs/models/tasks/_narval.yaml @@ -82,3 +82,12 @@ s2ef: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both train: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/train/ + +qm9: + all: + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + val: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + test: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 From 9ab12147a0c96436e1fdc49be256ca11813e80ed Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Sat, 14 Jan 2023 13:22:52 -0500 Subject: [PATCH 109/273] orion config s2ef --- 
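Quick pointer, since this adds another `orion:` exp config like the is2re-10k,
qm9 and is2re-all ones earlier in the series: the `unique_exp_name`, `space`
and `algorithms` keys are read by `load_orion_exp` in
ocpmodels/common/utils.py and handed to `orion.client.build_experiment`; each
worker then calls `suggest(1)` and trains on the returned trial's params (see
the `run()` method patched in main.py earlier in the series). A minimal sketch
of that flow, with illustrative yaml/DB paths rather than the exact values the
launcher computes:

    from pathlib import Path

    import yaml
    from orion.client import build_experiment

    exp = yaml.safe_load(
        Path("configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml").read_text()
    )
    orion_cfg = exp["orion"]

    # Build (or resume) the experiment, backed by a local pickled DB as in
    # load_orion_exp(). The db path here is illustrative.
    experiment = build_experiment(
        storage={
            "database": {
                "host": "data/orion/storage/example_db.pkl",
                "type": "pickleddb",
            }
        },
        name=orion_cfg["unique_exp_name"],
        space=orion_cfg["space"],
        algorithms=orion_cfg["algorithms"],
    )

    # One trial = one set of hyper-parameters for a single training run.
    trial = experiment.suggest(1)
    print(trial.params)
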
.../exps/icml/is2re-all/fanet-orion-1.yaml | 4 +- .../exps/icml/s2ef/fanet-orion-s2ef-1.yaml | 64 +++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml diff --git a/configs/exps/icml/is2re-all/fanet-orion-1.yaml b/configs/exps/icml/is2re-all/fanet-orion-1.yaml index a39db9ec02..48068985a7 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-1.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-1.yaml @@ -14,7 +14,7 @@ default: wandb_tags: is2re-all, orion cp_data_to_tmpdir: true graph_rewiring: remove-tag-0 - model: + model: edge_embed_type: all_rij frame_averaging: 2D fa_frames: random @@ -30,7 +30,7 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 50 + n_jobs: 166 unique_exp_name: fanet-is2re-all-v1 diff --git a/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml b/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml new file mode 100644 index 0000000000..58c5c64538 --- /dev/null +++ b/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml @@ -0,0 +1,64 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:1 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-s2ef-2M + mode: train + test_ri: true + wandb_tags: s2ef-2M, orion + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + frame_averaging: 2D + fa_frames: random + optim: + scheduler: LinearWarmupCosineAnnealingLR + force_coefficient: 100 + energy_coefficient: 1 + energy_grad_coefficient: 5 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 50 + + unique_exp_name: fanet-s2ef-2M-v1 + + space: + model/att_heads: choices([1,2,3,4]) + model/complex_mp: choices([True, False]) + model/cutoff: choices([4.0, 6.0, 10.0]) + model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) + model/graph_norm: choices([True, False]) + model/hidden_channels: uniform(6, 22, discrete=True) + model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) + model/num_filters: uniform(2, 18, discrete=True) + model/num_gaussians: uniform(30, 150, discrete=True) + model/num_interactions: uniform(3, 6, discrete=True) + model/pg_hidden_channels: uniform(0, 1, discrete=True) + model/phys_embeds: choices([True, False]) + model/regress_forces: choices(["direct_with_gradient_target", "direct"]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["add", "concat", False]) + model/tag_hidden_channels: uniform(0, 2, discrete=True) + model/max_num_neighbors: choices([30,40,50]) + optim/lr_initial: loguniform(5e-5, 5e-4, precision=2) + optim/max_epochs: fidelity(6, 22, base=6) + + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From f0c0c3cae30e8189b01e540adce30d30022d0ca2 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:30:17 
-0500 Subject: [PATCH 110/273] small regress_forces fix --- launch_exp.py | 2 +- ocpmodels/common/utils.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index 764d8b535c..867579ad81 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -16,7 +16,7 @@ def util_strings(jobs, yaml_comments=False): s = " • All jobs launched: " + ", ".join(jobs) s += "\n • Cancel experiment:\n $ scancel " + " ".join(jobs) - s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" + s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( "\n • Delete experiment run dirs:\n $ " + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index f81ba0ade5..70dd0812c0 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -974,6 +974,8 @@ def build_config(args, args_override): config["job_id"] = JOB_ID or "no-job-id" if "regress_forces" in config["model"]: + if config["model"]["regress_forces"] == "": + config["model"]["regress_forces"] = False if not isinstance(config["model"]["regress_forces"], str): if config["model"]["regress_forces"] is False: config["model"]["regress_forces"] = "" From e1304e52dea63b0e52f96753209647555a6c9b47 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:33:44 -0500 Subject: [PATCH 111/273] update fanet config with `use_pbc: False` for qm7x/9 --- configs/models/fanet.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index 0c789467b3..fe94635ca1 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -164,6 +164,7 @@ qm7x: model: hidden_channels: 384 num_interactions: 4 + use_pbc: False optim: lr_initial: 0.001 @@ -176,3 +177,10 @@ qm7x: all: {} 1k: {} + +qm9: + default: + model: + use_pbc: False + all: {} + 10k: {} From e7318fddd251676683ce0386a37a66420b510442 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:38:19 -0500 Subject: [PATCH 112/273] update `parse_value` --- ocpmodels/common/utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 70dd0812c0..e6dfe3572d 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -851,6 +851,10 @@ def parse_value(value): Parse string as Python literal if possible and fallback to string. 
""" try: + if value.lower() == "true": + return True + elif value.lower() == "false": + return False return ast.literal_eval(value) except (ValueError, SyntaxError): # Use as string if nothing else worked From 6ec231a814dd249ede1e380950ed09cfd4020b7c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:50:24 -0500 Subject: [PATCH 113/273] fix `tag_hidden_channels: 0` in qm9 exp --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 0ecb00c06c..74031b7ff5 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -21,12 +21,12 @@ default: scheduler: LinearWarmupCosineAnnealingLR batch_size: 64 note: - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial _root_: frame_averaging, fa_frames orion_mult_factor: value: 32 - targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels frame_averaging: 3D fa_frames: random model: @@ -50,7 +50,6 @@ orion: model/num_interactions: uniform(1, 7, discrete=True) model/pg_hidden_channels: uniform(0, 2, discrete=True) model/phys_embeds: choices([True, False]) - model/tag_hidden_channels: uniform(0, 2, discrete=True) algorithms: asha: seed: 123 From 2e5323004c337924b436f799daffc4ca94679e5a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 13:52:25 -0500 Subject: [PATCH 114/273] update name --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 74031b7ff5..a33b8bb704 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -36,7 +36,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v1.0.0 + unique_exp_name: fanet-qm9-v1.0.1 space: optim/max_epochs: fidelity(30, 300, base=6) From a3d36926614899b94fb6817274b7265307137835 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 14:15:36 -0500 Subject: [PATCH 115/273] refactor to DRAC, not narval only --- configs/models/tasks/_drac.yaml | 97 +++++++++++++++++++++++++++++++ configs/models/tasks/_narval.yaml | 93 ----------------------------- launch_exp.py | 2 +- ocpmodels/common/flags.py | 6 -- ocpmodels/common/logger.py | 8 +-- ocpmodels/common/utils.py | 49 +++++++++++----- sbatch.py | 5 +- 7 files changed, 141 insertions(+), 119 deletions(-) create mode 100644 configs/models/tasks/_drac.yaml delete mode 100644 configs/models/tasks/_narval.yaml diff --git a/configs/models/tasks/_drac.yaml b/configs/models/tasks/_drac.yaml new file mode 100644 index 0000000000..f623e4d860 --- /dev/null +++ b/configs/models/tasks/_drac.yaml @@ -0,0 +1,97 @@ +# this file overrides paths for data on drac clusters +drac_base_path: + narval: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data + beluga: 
/home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data + +is2re: + 10k: + val_id: + src: _base_/oc20/is2re/all/val_id/data.lmdb + val_ood_cat: + src: _base_/oc20/is2re/all/val_ood_cat/data.lmdb + val_ood_ads: + src: _base_/oc20/is2re/all/val_ood_ads/data.lmdb + val_ood_both: + src: _base_/oc20/is2re/all/val_ood_both/data.lmdb + train: + src: _base_/oc20/is2re/10k/train/data.lmdb + 100k: + val_id: + src: _base_/oc20/is2re/all/val_id/data.lmdb + val_ood_cat: + src: _base_/oc20/is2re/all/val_ood_cat/data.lmdb + val_ood_ads: + src: _base_/oc20/is2re/all/val_ood_ads/data.lmdb + val_ood_both: + src: _base_/oc20/is2re/all/val_ood_both/data.lmdb + train: + src: _base_/oc20/is2re/100k/train/data.lmdb + + all: + val_id: + src: _base_/oc20/is2re/all/val_id/data.lmdb + val_ood_cat: + src: _base_/oc20/is2re/all/val_ood_cat/data.lmdb + val_ood_ads: + src: _base_/oc20/is2re/all/val_ood_ads/data.lmdb + val_ood_both: + src: _base_/oc20/is2re/all/val_ood_both/data.lmdb + train: + src: _base_/oc20/is2re/all/train/data.lmdb +s2ef: + 200k: + val_id: + src: _base_/oc20/s2ef/all/val_id + val_ood_cat: + src: _base_/oc20/s2ef/all/val_ood_cat + val_ood_ads: + src: _base_/oc20/s2ef/all/val_ood_ads + val_ood_both: + src: _base_/oc20/s2ef/all/val_ood_both + train: + src: _base_/oc20/s2ef/200k/train + + 2M: + val_id: + src: _base_/oc20/s2ef/all/val_id + val_ood_cat: + src: _base_/oc20/s2ef/all/val_ood_cat + val_ood_ads: + src: _base_/oc20/s2ef/all/val_ood_ads + val_ood_both: + src: _base_/oc20/s2ef/all/val_ood_both + train: + src: _base_/oc20/s2ef/2M/train/ + + 20M: + val_id: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id + val_ood_cat: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat + val_ood_ads: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads + val_ood_both: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/20M/train/ + + all: + val_id: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id + val_ood_cat: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat + val_ood_ads: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads + val_ood_both: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/train/ + +qm9: + all: + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + val: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + test: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 diff --git a/configs/models/tasks/_narval.yaml b/configs/models/tasks/_narval.yaml deleted file mode 100644 index 9e43ec5a95..0000000000 --- a/configs/models/tasks/_narval.yaml +++ /dev/null @@ -1,93 +0,0 @@ -# this file overrides paths for data on Narval -is2re: - 10k: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_id/data.lmdb - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_cat/data.lmdb - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_ads/data.lmdb - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_both/data.lmdb - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/10k/train/data.lmdb - 100k: - val_id: - src: 
/home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_id/data.lmdb - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_cat/data.lmdb - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_ads/data.lmdb - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_both/data.lmdb - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/100k/train/data.lmdb - - all: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_id/data.lmdb - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_cat/data.lmdb - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_ads/data.lmdb - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/val_ood_both/data.lmdb - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/is2re/all/train/data.lmdb -s2ef: - 200k: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/200k/train - - 2M: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/2M/train/ - - 20M: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/20M/train/ - - all: - val_id: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_id - val_ood_cat: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_cat - val_ood_ads: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_ads - val_ood_both: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/val_ood_both - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/oc20/s2ef/all/train/ - -qm9: - all: - train: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 - val: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 - test: - src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 diff --git a/launch_exp.py b/launch_exp.py index 867579ad81..0d90f7b95b 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -19,7 +19,7 @@ def util_strings(jobs, yaml_comments=False): s += "\n • WandB query for dashboard:\n (" + "|".join(jobs) + ")" s += ( "\n • Delete experiment run dirs:\n $ " - + 'ocp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + + 'exp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' 
+ " ".join(jobs) + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' ) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 1199c344ba..77cb140cc3 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -231,12 +231,6 @@ def add_core_args(self): type=bool, help="Evaluate on test set", ) - self.parser.add_argument( - "--narval", - action="store_true", - default=False, - help="is on Narval DRAC cluster", - ) self.parser.add_argument( "--cp_data_to_tmpdir", type=bool, diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index c84734e129..0a3cddfb5b 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -14,7 +14,7 @@ import wandb from ocpmodels.common.registry import registry -from ocpmodels.common.utils import IS_NARVAL +from ocpmodels.common.utils import CLUSTER NTFY_OK = False try: @@ -125,13 +125,13 @@ def __init__(self, trainer_config): sbatch_files = list( Path(self.trainer_config["run_dir"]).glob("sbatch_script*.sh") ) - if len(sbatch_files) == 1 and not IS_NARVAL: + if len(sbatch_files) == 1 and not CLUSTER.drac: wandb.save(str(sbatch_files[0])) self.url = wandb.run.get_url() with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: f.write(self.url) - if not IS_NARVAL: + if not CLUSTER.drac: self.collect_output_files(policy="live") self.collect_output_files(policy="end") @@ -171,7 +171,7 @@ def finish(self, error_or_signal=False): self.add_tags("Preempted") if error_or_signal is True: exit_code = 1 - if not IS_NARVAL: + if not CLUSTER.drac: self.collect_output_files(policy="now") wandb.finish(exit_code=exit_code) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index e6dfe3572d..a84dc73fee 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -39,13 +39,28 @@ from ocpmodels.common.flags import flags from ocpmodels.common.registry import registry + +class Cluster: + def __init__(self): + self._is = { + "narval": "narval.calcul.quebec" in os.environ.get("HOSTNAME", ""), + "beluga": "beluga.calcul.quebec" in os.environ.get("HOSTNAME", ""), + "mila": "/home/mila/" in os.environ.get("HOME", ""), + } + self.name = [k for k, v in self._is.items() if v][0].capitalize() + self.Name = self.name.capitalize() + self._id["drac"] = self._is["narval"] or self._is["beluga"] + + def __getattribute__(self, k: str): + if k in self._is: + return self._is[k] + raise AttributeError("Unknown attribute " + k) + + +CLUSTER = Cluster() OCP_TASKS = {"s2ef", "is2re", "is2es"} ROOT = Path(__file__).resolve().parent.parent.parent JOB_ID = os.environ.get("SLURM_JOB_ID") -IS_NARVAL = ( - "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") - or os.environ.get("HOME") == "/home/vsch" -) def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): @@ -306,23 +321,32 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): return trainer_config -def override_narval_paths(trainer_config): - is_narval = IS_NARVAL or trainer_config.get("narval") - if not is_narval: +def override_drac_paths(trainer_config): + if not CLUSTER.drac: return trainer_config + path_overrides = yaml.safe_load( - (ROOT / "configs" / "models" / "tasks" / "_narval.yaml").read_text() + (ROOT / "configs" / "models" / "tasks" / "_drac.yaml").read_text() ) + base_path = path_overrides["drac_base_path"][CLUSTER.name] task = trainer_config["task"]["name"] split = trainer_config["task"]["split"] - assert task in path_overrides, f"Task {task} not found in Narval paths overrides" + assert ( + 
task in path_overrides + ), f"Task {task} not found in {CLUSTER.Name} paths overrides" assert ( split in path_overrides[task] - ), f"Split {split} not found in Narval paths overrides for task {task}" + ), f"Split {split} not found in {CLUSTER.Name} paths overrides for task {task}" + + for t, task in copy.deepcopy(path_overrides).items(): + for sub, subset in task.items(): + for spl, split in subset.items(): + src = split["src"].replace("_base_", base_path).replace("//", "/") + path_overrides[t][sub][spl]["src"] = src print( - "Is on Narval. Overriding", + f"Is on {CLUSTER.Name}. Overriding", trainer_config["dataset"], "with", path_overrides[task][split], @@ -1002,7 +1026,7 @@ def build_config(args, args_override): config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) - config = override_narval_paths(config) + config = override_drac_paths(config) if not config["no_cpus_to_workers"]: cpus = count_cpus() @@ -1547,7 +1571,6 @@ def base_config(config, overrides={}): n, [ "run_dir=.", - "narval=", "no_qm7x_cp=true", "no_cpus_to_workers=true", "silent=", diff --git a/sbatch.py b/sbatch.py index a7a5fa2f45..2a6cfd57e1 100644 --- a/sbatch.py +++ b/sbatch.py @@ -8,8 +8,9 @@ import re import yaml -IS_NARVAL = ( +IS_DRAC = ( "narval.calcul.quebec" in os.environ.get("HOSTNAME", "") + or "beluga.calcul.quebec" in os.environ.get("HOSTNAME", "") or os.environ.get("HOME") == "/home/vsch" ) @@ -296,7 +297,7 @@ def write_orion_config(args, outdir): } if args.time: sbatch_params["time"] = args.time - if IS_NARVAL: + if IS_DRAC: del sbatch_params["partition"] sbatch_params["account"] = "rrg-bengioy-ad_gpu" if "time" not in sbatch_params: From c522a5a84087aaf841f2b17f042669e28593bc45 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 14:17:57 -0500 Subject: [PATCH 116/273] log cluster name --- ocpmodels/common/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index a84dc73fee..07d892973f 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -47,9 +47,13 @@ def __init__(self): "beluga": "beluga.calcul.quebec" in os.environ.get("HOSTNAME", ""), "mila": "/home/mila/" in os.environ.get("HOME", ""), } - self.name = [k for k, v in self._is.items() if v][0].capitalize() + self.name = [k for k, v in self._is.items() if v] + if not self.name: + self.name = "unknown" + else: + self.name = self.name[0] self.Name = self.name.capitalize() - self._id["drac"] = self._is["narval"] or self._is["beluga"] + self._is["drac"] = self._is["narval"] or self._is["beluga"] def __getattribute__(self, k: str): if k in self._is: @@ -1000,6 +1004,7 @@ def build_config(args, args_override): config["run_dir"] = resolve(config["run_dir"]) config["slurm"] = {} config["job_id"] = JOB_ID or "no-job-id" + config["cluster_name"] = CLUSTER.name if "regress_forces" in config["model"]: if config["model"]["regress_forces"] == "": From c11b4211c1bc75bb3847a88ac4a00d277fd9e276 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 15:53:22 -0500 Subject: [PATCH 117/273] error in getattribute --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 07d892973f..aa21519d31 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -55,7 +55,7 @@ def __init__(self): self.Name = self.name.capitalize() self._is["drac"] = self._is["narval"] or self._is["beluga"] - def 
__getattribute__(self, k: str): + def __getattr__(self, k: str): if k in self._is: return self._is[k] raise AttributeError("Unknown attribute " + k) From 3ef4445bfdcfdda70a88088e129cfacaa5503c69 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 15:54:02 -0500 Subject: [PATCH 118/273] typo --- launch_exp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index 0d90f7b95b..e2c55e6eab 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -21,7 +21,7 @@ def util_strings(jobs, yaml_comments=False): "\n • Delete experiment run dirs:\n $ " + 'exp_run_dirs="$SCRATCH/ocp/runs"; for jid in ' + " ".join(jobs) - + '; do rm -rf "$ocp_run_dirs/$jid" && echo "Deleted $ocp_run_dirs/$jid"; done;' + + '; do rm -rf "$exp_run_dirs/$jid" && echo "Deleted $exp_run_dirs/$jid"; done;' ) if yaml_comments: s = "\n".join(["# " + line for line in s.splitlines()]) From 39d6c3690cd9bc40951bc8f6b9a8c7fac444aeae Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 16:51:48 -0500 Subject: [PATCH 119/273] use `CC_CLUSTER` env var --- ocpmodels/common/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index aa21519d31..bfa9d208c9 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -43,8 +43,8 @@ class Cluster: def __init__(self): self._is = { - "narval": "narval.calcul.quebec" in os.environ.get("HOSTNAME", ""), - "beluga": "beluga.calcul.quebec" in os.environ.get("HOSTNAME", ""), + "narval": os.environ.get("CC_CLUSTER") == "narval", + "beluga": os.environ.get("CC_CLUSTER") == "beluga", "mila": "/home/mila/" in os.environ.get("HOME", ""), } self.name = [k for k, v in self._is.items() if v] From fcbc47b874610579f8ae25c6d346547e08e10862 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 16:58:30 -0500 Subject: [PATCH 120/273] `pop` `drac_base_path` --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index bfa9d208c9..5ba27a16cf 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -332,7 +332,7 @@ def override_drac_paths(trainer_config): path_overrides = yaml.safe_load( (ROOT / "configs" / "models" / "tasks" / "_drac.yaml").read_text() ) - base_path = path_overrides["drac_base_path"][CLUSTER.name] + base_path = path_overrides.pop("drac_base_path")[CLUSTER.name] task = trainer_config["task"]["name"] split = trainer_config["task"]["split"] assert ( From 5c62aa467fbae8b0e6de11a9fd3e51118824ca0b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 17:35:26 -0500 Subject: [PATCH 121/273] fix loop var leak --- launch_exp.py | 1 + ocpmodels/common/utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index e2c55e6eab..8ab6d86867 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -247,6 +247,7 @@ def get_args_or_exp(key, args, exp): search_path.write_text(dump(exp["orion"])) outputs = [] + print() for c, command in enumerate(commands): print(f"Launching job {c+1:3}", end="\r") outputs.append(os.popen(command).read().strip()) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 5ba27a16cf..241343d984 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -343,10 +343,10 @@ def override_drac_paths(trainer_config): split in path_overrides[task] ), f"Split {split} not found in {CLUSTER.Name} paths overrides for 
task {task}" - for t, task in copy.deepcopy(path_overrides).items(): - for sub, subset in task.items(): - for spl, split in subset.items(): - src = split["src"].replace("_base_", base_path).replace("//", "/") + for t, task_dict in copy.deepcopy(path_overrides).items(): + for sub, subset_dict in task_dict.items(): + for spl, split_dict in subset_dict.items(): + src = split_dict["src"].replace("_base_", base_path).replace("//", "/") path_overrides[t][sub][spl]["src"] = src print( From bb5f8779312d64ed476036ff8ede7a7a4b24e25d Mon Sep 17 00:00:00 2001 From: alexhernandezgarcia Date: Sat, 14 Jan 2023 18:06:32 -0500 Subject: [PATCH 122/273] add break line in wandb_url.txt --- configs/sbatch/alex.hernandez-garcia.yaml | 3 ++- ocpmodels/common/logger.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/configs/sbatch/alex.hernandez-garcia.yaml b/configs/sbatch/alex.hernandez-garcia.yaml index 7afa1f9a41..dfa273625e 100644 --- a/configs/sbatch/alex.hernandez-garcia.yaml +++ b/configs/sbatch/alex.hernandez-garcia.yaml @@ -1,4 +1,5 @@ # Overwrites defaults.yaml for user `schmidtv`. # Create your own $USER.yaml in order to overwrite defaults.yaml systematically to your own taste. virtualenv: True -env: /home/mila/a/alex.hernandez-garcia/.virtualenvs/ocp-torch1110cuda102 +env: /home/mila/a/alex.hernandez-garcia/.virtualenvs/ocp-torch1121cuda112 +modules: cuda/11.2, python/3.8 diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 0a3cddfb5b..7d19ba2106 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -130,7 +130,7 @@ def __init__(self, trainer_config): self.url = wandb.run.get_url() with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: - f.write(self.url) + f.write(self.url + "\n") if not CLUSTER.drac: self.collect_output_files(policy="live") self.collect_output_files(policy="end") From 41d71517c68fe15c04290c3c6fb8f1309df76e4c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 20:12:39 -0500 Subject: [PATCH 123/273] async cancel --- main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 5e4f99c427..1620005e9f 100644 --- a/main.py +++ b/main.py @@ -200,9 +200,9 @@ def run(self, orion_exp=None): distutils.cleanup() print("Done!") + if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): + print("\nSelf-canceling SLURM job in 32s", JOB_ID) + os.popen(f"sleep 32 && scancel {JOB_ID}") + if runner and runner.trainer and runner.trainer.logger: runner.trainer.logger.finish(error or signal) - - if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): - print("\nSelf-canceling SLURM job", JOB_ID) - os.system(f"scancel {JOB_ID}") From 63992210d89539400c1834820ee165f9b9907775 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 14 Jan 2023 20:16:58 -0500 Subject: [PATCH 124/273] update proj --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index a33b8bb704..aea67fb2d8 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -7,7 +7,7 @@ job: partition: long default: - wandb_project: ocp-4 + wandb_project: ocp-qm config: fanet-qm9-all mode: train test_ri: true From e82bc876493d520a2acfe591346374ab50786a96 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Sun, 15 Jan 2023 12:26:45 -0500 Subject: [PATCH 125/273] config orion 2 --- 
.../exps/icml/is2re-all/fanet-orion-2.yaml | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 configs/exps/icml/is2re-all/fanet-orion-2.yaml diff --git a/configs/exps/icml/is2re-all/fanet-orion-2.yaml b/configs/exps/icml/is2re-all/fanet-orion-2.yaml new file mode 100644 index 0000000000..a9f9a03370 --- /dev/null +++ b/configs/exps/icml/is2re-all/fanet-orion-2.yaml @@ -0,0 +1,60 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + time: 10:00:00 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-is2re-all + mode: train + test_ri: true + wandb_tags: is2re-all, orion-2 + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + graph_norm: True + weighted-av-final-embeds: True + frame_averaging: 2D + fa_frames: random + max_epochs_fidelity: 30 + optim: + scheduler: LinearWarmupCosineAnnealingLR + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 216 + + unique_exp_name: fanet-is2re-all-v1 + + space: + optim/max_epochs: fidelity(15, 30, base=6) + optim/lr_initial: loguniform(6e-4, 4e-3, precision=2) + model/hidden_channels: uniform(8, 19, discrete=True) + model/mp_type: choices(["base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "updown_local_env"]) + model/num_filters: uniform(3, 18, discrete=True) + model/num_gaussians: uniform(50, 170, discrete=True) + model/num_interactions: uniform(3, 7, discrete=True) + model/pg_hidden_channels: uniform(0, 2, discrete=True) + model/phys_embeds: choices([True, False]) + model/tag_hidden_channels: uniform(0, 3, discrete=True) + model/complex_mp: choices([True, False]) + model/att_heads: choices([1,3,6]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["add", "concat", False]) + model/cutoff: choices([4.0, 6.0, 10.0]) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From 29090f25cf8fb534fe58859e6c79e8b4d6643925 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 12:26:55 -0500 Subject: [PATCH 126/273] set `fidelity_max_epochs` auto and use that for steps --- main.py | 14 +++++--- ocpmodels/common/utils.py | 11 ++++-- ocpmodels/trainers/base_trainer.py | 56 ++++++++++++++++++++++++------ 3 files changed, 63 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 1620005e9f..4d0a29dba8 100644 --- a/main.py +++ b/main.py @@ -34,6 +34,7 @@ setup_logging, unflatten_dict, update_from_sbatch_py_vars, + set_max_fidelity, ) from ocpmodels.trainers import BaseTrainer @@ -77,13 +78,16 @@ def run(self, orion_exp=None): if distutils.is_master(): if orion_exp: orion_trial = orion_exp.suggest(1) - self.hparams = unflatten_dict( - apply_mult_factor( - orion_trial.params, - self.trainer_config.get("orion_mult_factor"), + self.hparams = set_max_fidelity( + unflatten_dict( + apply_mult_factor( + orion_trial.params, + self.trainer_config.get("orion_mult_factor"), + sep="/", + ), sep="/", ), - sep="/", + 
orion_exp, ) self.hparams["orion_hash_params"] = orion_trial.hash_params self.hparams["orion_unique_exp_name"] = orion_exp.name diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 241343d984..d13d8a484f 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -67,6 +67,13 @@ def __getattr__(self, k: str): JOB_ID = os.environ.get("SLURM_JOB_ID") +def set_max_fidelity(hparams, orion_exp): + for p, prior in orion_exp.space.items(): + if prior.type == "fidelity": + hparams[f"fidelity_{p}"] = prior.high + return hparams + + def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): """ Multiplies all values of orion_hparams listed in mult_factor_dict["targets"] @@ -169,9 +176,6 @@ def continue_orion_exp(trainer_config): base_dir = Path(trainer_config["run_dir"]).parent existing_id_files = list(base_dir.glob(f"*/{id_file}")) - if not existing_id_files: - return trainer_config - latest_dirs = sorted( [ f.parent @@ -182,6 +186,7 @@ def continue_orion_exp(trainer_config): ) if not latest_dirs: + print("\n😅 No previous Orion trial matched for unique file: ", id_file) return trainer_config resume_dir = latest_dirs[-1] diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 357c6e1f40..ebe3b6a8bb 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -221,8 +221,29 @@ def load_datasets(self): transform = get_transforms(self.config) # TODO: train/val/test behavior batch_size = self.config["optim"]["batch_size"] - max_steps = self.config["optim"].get("max_steps", -1) - max_samples = self.config["optim"].get("max_samples", -1) + epochs_key = ( + "max_epochs" + if "fidelity_max_epochs" not in self.config["optim"] + else "fidelity_max_epochs" + ) + steps_key = ( + "max_steps" + if "fidelity_max_steps" not in self.config["optim"] + else "fidelity_max_steps" + ) + samples_key = ( + "max_samples" + if "fidelity_max_samples" not in self.config["optim"] + else "fidelity_max_samples" + ) + max_epochs = self.config["optim"].get(epochs_key, -1) + max_steps = self.config["optim"].get(steps_key, -1) + max_samples = self.config["optim"].get(samples_key, -1) + print("Optim config auto update:") + for k, v in zip( + [epochs_key, steps_key, samples_key], [max_epochs, max_steps, max_samples] + ): + print(f" • {k}: {v}") for split, ds_conf in self.config["dataset"].items(): if split == "default_val": @@ -235,43 +256,49 @@ def load_datasets(self): shuffle = False if split == "train": shuffle = True + n_train = len(self.datasets[split]) if max_samples > 0: - if self.config["optim"].get("max_epochs", -1) > 0: + if max_epochs > 0: print( "\nWARNING: Both max_samples and max_epochs are set.", "Using max_samples.", ) - if self.config["optim"].get("max_steps", -1) > 0: + if max_steps > 0: print( "WARNING: Both max_samples and max_steps are set.", "Using max_samples.\n", ) self.config["optim"]["max_epochs"] = int( - np.ceil(max_samples / len(self.datasets[split])) + np.ceil(max_samples / n_train) ) self.config["optim"]["max_steps"] = int( np.ceil(max_samples / batch_size) ) elif max_steps > 0: - if self.config["optim"].get("max_epochs", -1) > 0: + if max_epochs > 0: print( "\nWARNING: Both max_steps and max_epochs are set.", "Using max_steps.\n", ) self.config["optim"]["max_epochs"] = int( - np.ceil(max_steps / (len(self.datasets[split]) / batch_size)) + np.ceil(max_steps / (n_train / batch_size)) ) print( "Setting max_epochs to", self.config["optim"]["max_epochs"], f"from max_steps ({max_steps}),", - f"dataset 
length ({len(self.datasets[split])}),", + f"dataset length ({n_train}),", f"and batch_size ({batch_size})\n", ) else: self.config["optim"]["max_steps"] = int( - self.config["optim"]["max_epochs"] - * (len(self.datasets[split]) / batch_size) + np.ceil(max_epochs * (n_train / batch_size)) + ) + print( + "Setting max_steps to ", + f"{self.config['optim']['max_steps']} from", + f"max_epochs ({max_epochs}), dataset length", + f"({n_train}), and batch_size ({batch_size})\n", ) self.samplers[split] = self.get_sampler( @@ -383,6 +410,12 @@ def load_checkpoint(self, checkpoint_path): self.optimizer.load_state_dict(checkpoint["optimizer"]) if "scheduler" in checkpoint and checkpoint["scheduler"] is not None: self.scheduler.scheduler.load_state_dict(checkpoint["scheduler"]) + if checkpoint.get("warmup_scheduler") is not None and hasattr( + self.scheduler, "warmup_scheduler" + ): + self.scheduler.warmup_scheduler.load_state_dict( + checkpoint["warmup_scheduler"] + ) if "ema" in checkpoint and checkpoint["ema"] is not None: self.ema.load_state_dict(checkpoint["ema"]) else: @@ -484,6 +517,9 @@ def save( "scheduler": self.scheduler.scheduler.state_dict() if self.scheduler.scheduler_type != "Null" else None, + "warmup_scheduler": self.scheduler.warmup_scheduler.state_dict() + if hasattr(self.scheduler, "warmup_scheduler") + else None, "normalizers": { key: value.state_dict() for key, value in self.normalizers.items() From a5ce1278cc87af7b038256e5909623d7c323f58f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 12:27:25 -0500 Subject: [PATCH 127/273] update drac paths --- configs/models/tasks/_drac.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/models/tasks/_drac.yaml b/configs/models/tasks/_drac.yaml index f623e4d860..bbfa6a9847 100644 --- a/configs/models/tasks/_drac.yaml +++ b/configs/models/tasks/_drac.yaml @@ -1,7 +1,7 @@ # this file overrides paths for data on drac clusters drac_base_path: narval: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data - beluga: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data + beluga: /scratch/vsch/ocp-data is2re: 10k: From db31ce406cf9d79a7b45b3c6ae626c63dde16ae3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 12:36:46 -0500 Subject: [PATCH 128/273] imrpove `T_max` setting --- ocpmodels/common/utils.py | 8 +++--- ocpmodels/modules/scheduler.py | 6 ++++- ocpmodels/trainers/base_trainer.py | 41 +++++++++++++----------------- 3 files changed, 28 insertions(+), 27 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index d13d8a484f..f5c38cdc3c 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -666,9 +666,11 @@ def warmup_lr_lambda(current_step, optim_config): # exponential decay per step assert "decay_rate" in optim_config, "decay_rate must be defined in optim" ds = optim_config["decay_steps"] - if ds == "max_steps": - assert "max_steps" in optim_config, "max_steps must be defined in optim" - ds = optim_config["max_steps"] + if isinstance(ds, str): + assert ( + ds in optim_config + ), f"ds is {ds}, it must be defined in optim ({optim_config})" + ds = optim_config[ds] return optim_config["decay_rate"] ** ( (current_step - optim_config["warmup_steps"]) / ds diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 8a4d082188..5207e2b0e3 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -44,11 +44,15 @@ def scheduler_lambda_fn(x): scheduler_args = self.filter_kwargs(self.optim_config) 
self.scheduler = self.scheduler(optimizer, **scheduler_args) elif self.scheduler_type == "LinearWarmupCosineAnnealingLR": + T_max = ( + self.optim_config.get("fidelity_max_steps") + or self.optim_config["max_steps"] + ) self.warmup_scheduler = warmup.ExponentialWarmup( self.optimizer, warmup_period=self.optim_config["warmup_steps"] ) self.scheduler = lr_scheduler.CosineAnnealingLR( - self.optimizer, T_max=self.optim_config["max_steps"], eta_min=1e-7 + self.optimizer, T_max=T_max, eta_min=1e-7 ) def step(self, metrics=None, epoch=None): diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index ebe3b6a8bb..b2fa37d875 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -221,29 +221,10 @@ def load_datasets(self): transform = get_transforms(self.config) # TODO: train/val/test behavior batch_size = self.config["optim"]["batch_size"] - epochs_key = ( - "max_epochs" - if "fidelity_max_epochs" not in self.config["optim"] - else "fidelity_max_epochs" - ) - steps_key = ( - "max_steps" - if "fidelity_max_steps" not in self.config["optim"] - else "fidelity_max_steps" - ) - samples_key = ( - "max_samples" - if "fidelity_max_samples" not in self.config["optim"] - else "fidelity_max_samples" - ) - max_epochs = self.config["optim"].get(epochs_key, -1) - max_steps = self.config["optim"].get(steps_key, -1) - max_samples = self.config["optim"].get(samples_key, -1) - print("Optim config auto update:") - for k, v in zip( - [epochs_key, steps_key, samples_key], [max_epochs, max_steps, max_samples] - ): - print(f" • {k}: {v}") + + max_epochs = self.config["optim"].get("max_epochs", -1) + max_steps = self.config["optim"].get("max_steps", -1) + max_samples = self.config["optim"].get("max_samples", -1) for split, ds_conf in self.config["dataset"].items(): if split == "default_val": @@ -257,6 +238,20 @@ def load_datasets(self): if split == "train": shuffle = True n_train = len(self.datasets[split]) + + if "fidelity_max_epochs" in self.config["optim"]: + self.config["optim"]["fidelity_max_steps"] = int( + np.ceil( + self.config["optim"]["fidelity_max_epochs"] + * (n_train / batch_size) + ) + ) + print( + "Setting fidelity_max_steps to {}".format( + self.config["optim"]["fidelity_max_steps"] + ) + ) + if max_samples > 0: if max_epochs > 0: print( From 649f09d844de23e23fc049402d1743d668768c93 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:06:56 -0500 Subject: [PATCH 129/273] store all job ids --- ocpmodels/common/logger.py | 3 +++ ocpmodels/common/utils.py | 1 + ocpmodels/trainers/base_trainer.py | 4 ++++ 3 files changed, 8 insertions(+) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 0a3cddfb5b..b3eaec103b 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -95,6 +95,7 @@ def __init__(self, trainer_config): if trainer_config.get("wandb_resume_id"): wandb_id = trainer_config["wandb_resume_id"] + print("⛑ Resuming wandb run: ", wandb_id) else: wandb_id = str(self.trainer_config.get("wandb_id", "")) if wandb_id: @@ -110,6 +111,8 @@ def __init__(self, trainer_config): note = self.trainer_config.get("note", "") name = self.trainer_config["wandb_name"] or wandb_id + print("Initializing wandb run: ", wandb_id, "with name: ", name) + self.run = wandb.init( config=self.trainer_config, id=wandb_id, diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index f5c38cdc3c..d01d2d761e 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1011,6 +1011,7 
@@ def build_config(args, args_override): config["run_dir"] = resolve(config["run_dir"]) config["slurm"] = {} config["job_id"] = JOB_ID or "no-job-id" + config["job_ids"] = JOB_ID or "no-job-id" config["cluster_name"] = CLUSTER.name if "regress_forces" in config["model"]: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index b2fa37d875..2a07f85d3f 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -422,6 +422,10 @@ def load_checkpoint(self, checkpoint_path): if self.scaler and checkpoint["amp"]: self.scaler.load_state_dict(checkpoint["amp"]) + if "config" in checkpoint: + if "job_ids" in checkpoint["config"]: + self.config["job_ids"] = checkpoint["config"]["job_ids"] + f", {JOB_ID}" + def load_loss(self): self.loss_fn = {} self.loss_fn["energy"] = self.config["optim"].get("loss_energy", "mae") From b11a1019e6d0967bb745499ebb484f9db483d59d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:09:43 -0500 Subject: [PATCH 130/273] print which T_max is chosen --- ocpmodels/modules/scheduler.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 5207e2b0e3..e1c203059a 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -44,10 +44,13 @@ def scheduler_lambda_fn(x): scheduler_args = self.filter_kwargs(self.optim_config) self.scheduler = self.scheduler(optimizer, **scheduler_args) elif self.scheduler_type == "LinearWarmupCosineAnnealingLR": - T_max = ( - self.optim_config.get("fidelity_max_steps") - or self.optim_config["max_steps"] - ) + T_max = self.optim_config.get("fidelity_max_steps") + if T_max is None: + T_max = self.optim_config["max_steps"] + print(f"Using max_steps for scheduler -> {T_max}") + else: + print(f"Using fidelity_max_steps for scheduler -> {T_max}") + self.warmup_scheduler = warmup.ExponentialWarmup( self.optimizer, warmup_period=self.optim_config["warmup_steps"] ) From d5701a72adc2683257cfaaccc4bae8ee1abaae79 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:13:32 -0500 Subject: [PATCH 131/273] update `schmidtv` defaults --- configs/sbatch/schmidtv.yaml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/configs/sbatch/schmidtv.yaml b/configs/sbatch/schmidtv.yaml index 942619513f..5f6dbfdf49 100644 --- a/configs/sbatch/schmidtv.yaml +++ b/configs/sbatch/schmidtv.yaml @@ -1,9 +1,5 @@ # Overwrites defaults.yaml for user `schmidtv`. # Create your own $USER.yaml in order to overwrite defaults.yaml systematically to your own taste. 
-cpus: 8 -mem: 128GB -env: ocp-env -gres: gpu:rtx8000:4 -partition: long +env: ocp-a100 modules: anaconda/3 From 985f07d47ac53afe3644e590d3e886cb68035bed Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:22:00 -0500 Subject: [PATCH 132/273] fix `set_max_fidelity` --- configs/exps/icml/is2re-10k/fanet-orion.yaml | 2 +- ocpmodels/common/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/exps/icml/is2re-10k/fanet-orion.yaml b/configs/exps/icml/is2re-10k/fanet-orion.yaml index 41d5fd07b1..4dd2b528b7 100644 --- a/configs/exps/icml/is2re-10k/fanet-orion.yaml +++ b/configs/exps/icml/is2re-10k/fanet-orion.yaml @@ -4,7 +4,7 @@ job: cpus: 4 gres: gpu:1 time: 30:00 - partition: long + partition: main # code_loc: /home/mila/s/schmidtv/ocp-project/ocp-drlab # env: ocp-a100 diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index d01d2d761e..bcf5e5627e 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -70,7 +70,7 @@ def __getattr__(self, k: str): def set_max_fidelity(hparams, orion_exp): for p, prior in orion_exp.space.items(): if prior.type == "fidelity": - hparams[f"fidelity_{p}"] = prior.high + hparams[f"fidelity_{p.split('/')[-1]}"] = prior.high return hparams From b3c3e61c22099a5f4d54d02bf5f1a57a9ce32c85 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:26:23 -0500 Subject: [PATCH 133/273] nested `set_max_fidelity` --- ocpmodels/common/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index bcf5e5627e..ff8e35e4b9 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -70,7 +70,15 @@ def __getattr__(self, k: str): def set_max_fidelity(hparams, orion_exp): for p, prior in orion_exp.space.items(): if prior.type == "fidelity": - hparams[f"fidelity_{p.split('/')[-1]}"] = prior.high + keys = p.split("/") + if len(keys) == 1: + hparams[f"fidelity_{p}"] = prior.high + elif len(keys) == 2: + if keys[0] not in hparams: + hparams[keys[0]] = {} + hparams[keys[0]][f"fidelity_{keys[1]}"] = prior.high + else: + print("Error: fidelity parameters must be at most 2 levels deep.") return hparams From 9486321ea8995770ffe55c9524c18aca43264a4e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:31:08 -0500 Subject: [PATCH 134/273] remove `max_epochs_fidelity` --- configs/exps/icml/is2re-all/fanet-orion-2.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/configs/exps/icml/is2re-all/fanet-orion-2.yaml b/configs/exps/icml/is2re-all/fanet-orion-2.yaml index a9f9a03370..ae26af2e9e 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-2.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-2.yaml @@ -20,7 +20,6 @@ default: weighted-av-final-embeds: True frame_averaging: 2D fa_frames: random - max_epochs_fidelity: 30 optim: scheduler: LinearWarmupCosineAnnealingLR note: From 126f8ae852b331797a4f0dea912d69cb7d9610e3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:36:51 -0500 Subject: [PATCH 135/273] fix exp manager with variable db path --- ocpmodels/common/exp_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index c5321075df..a52793f307 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -214,10 +214,6 @@ def help(self): "help": False, "name": None, "wandb_path": None, - "orion_db_path": str( - 
Path(__file__).resolve().parent.parent.parent - / "data/orion/storage/orion_db.pkl" - ), "watch": -1, } args = resolved_args(defaults=defaults) @@ -248,10 +244,14 @@ def help(self): "💃 Status of experiment", f"'{args.name}' and wandb entity/project '{args.wandb_path}':", ) + orion_db_path = str( + Path(__file__).resolve().parent.parent.parent + / f"data/orion/storage/{args.name}_db.pkl" + ) m = Manager( name=args.name, wandb_path=args.wandb_path, - orion_db_path=args.orion_db_path, + orion_db_path=orion_db_path, ) m.print_wandb_query() From 5b100ce0b15d84e3e2e9869cf201ba2f62592224 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:37:57 -0500 Subject: [PATCH 136/273] clean prints --- ocpmodels/common/exp_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index a52793f307..27ceb86817 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -254,9 +254,9 @@ def help(self): orion_db_path=orion_db_path, ) - m.print_wandb_query() - exp_df = m.exp.to_pandas() - reserved_wandbs = m.get_reserved_wandb_runs() + # m.print_wandb_query() + # exp_df = m.exp.to_pandas() + # reserved_wandbs = m.get_reserved_wandb_runs() if args.watch and args.watch > 0: if args.watch < 15: From e4a209844edee3da342b494d90b49cd9d670a167 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 17:59:03 -0500 Subject: [PATCH 137/273] update --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 8 ++++---- ocpmodels/common/exp_manager.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index aea67fb2d8..983e67f951 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -36,10 +36,10 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v1.0.1 + unique_exp_name: fanet-qm9-v1.0.2 space: - optim/max_epochs: fidelity(30, 300, base=6) + optim/max_epochs: fidelity(50, 300, base=6) optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) model/graph_norm: choices([True, False]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) @@ -47,8 +47,8 @@ orion: model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"]) model/num_filters: uniform(3, 16, discrete=True) model/num_gaussians: uniform(20, 150, discrete=True) - model/num_interactions: uniform(1, 7, discrete=True) - model/pg_hidden_channels: uniform(0, 2, discrete=True) + model/num_interactions: uniform(2, 7, discrete=True) + model/pg_hidden_channels: uniform(0, 1, discrete=True) model/phys_embeds: choices([True, False]) algorithms: asha: diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 27ceb86817..be8ab8f307 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -1,7 +1,7 @@ +import wandb from orion.client import get_experiment from pathlib import Path from collections import defaultdict, Counter -import wandb from textwrap import dedent from minydra import resolved_args import os From eb05b524fe8984aad4608b6e51c1033a35ddcf7f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 18:40:38 -0500 Subject: [PATCH 138/273] update exp name --- configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml b/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml index 58c5c64538..efbe7ec3d3 100644 --- a/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml +++ b/configs/exps/icml/s2ef/fanet-orion-s2ef-1.yaml @@ -34,7 +34,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 50 - unique_exp_name: fanet-s2ef-2M-v1 + unique_exp_name: fanet-s2ef-2M-v1.1 space: model/att_heads: choices([1,2,3,4]) From 027c8505a181f661490fcf804fefa437244e446a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 22:21:12 -0500 Subject: [PATCH 139/273] `distutils` rename to `dist_utils` not to conflict with stdlib --- main.py | 72 +++++++++++-------- ocpmodels/common/data_parallel.py | 4 +- .../common/{distutils.py => dist_utils.py} | 0 ocpmodels/common/exp_manager.py | 7 +- ocpmodels/modules/loss.py | 6 +- ocpmodels/trainers/base_trainer.py | 50 ++++++------- ocpmodels/trainers/energy_trainer.py | 10 +-- ocpmodels/trainers/forces_trainer.py | 26 +++---- ocpmodels/trainers/single_trainer.py | 28 ++++---- 9 files changed, 111 insertions(+), 92 deletions(-) rename ocpmodels/common/{distutils.py => dist_utils.py} (100%) diff --git a/main.py b/main.py index 4d0a29dba8..d7505e383f 100644 --- a/main.py +++ b/main.py @@ -8,20 +8,22 @@ import copy import logging import os +import shutil import time import traceback import warnings import torch +from orion.core.utils.exceptions import ReservationRaceCondition from yaml import dump -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.flags import flags from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, - auto_note, apply_mult_factor, + auto_note, build_config, continue_from_slurm_job_id, continue_orion_exp, @@ -30,11 +32,11 @@ move_lmdb_data_to_slurm_tmpdir, read_slurm_env, resolve, + set_max_fidelity, setup_imports, setup_logging, unflatten_dict, update_from_sbatch_py_vars, - set_max_fidelity, ) from ocpmodels.trainers import BaseTrainer @@ -75,30 +77,42 @@ def __init__(self, trainer_config): def run(self, orion_exp=None): orion_trial = None self.original_config = copy.deepcopy(self.trainer_config) - if distutils.is_master(): + orion_race_condition = False + if dist_utils.is_master(): if orion_exp: - orion_trial = orion_exp.suggest(1) - self.hparams = set_max_fidelity( - unflatten_dict( - apply_mult_factor( - orion_trial.params, - self.trainer_config.get("orion_mult_factor"), - sep="/", - ), - sep="/", - ), - orion_exp, - ) - self.hparams["orion_hash_params"] = orion_trial.hash_params - self.hparams["orion_unique_exp_name"] = orion_exp.name + try: + orion_trial = orion_exp.suggest(1) + self.hparams = set_max_fidelity( + unflatten_dict( + apply_mult_factor( + orion_trial.params, + self.trainer_config.get("orion_mult_factor"), + sep="/", + ), + sep="/", + ), + orion_exp, + ) + self.hparams["orion_hash_params"] = orion_trial.hash_params + self.hparams["orion_unique_exp_name"] = orion_exp.name + except ReservationRaceCondition: + orion_race_condition = True - should_be_0 = distutils.get_rank() - hp_list = [self.hparams, should_be_0] + should_be_0 = dist_utils.get_rank() + hp_list = [self.hparams, should_be_0, orion_race_condition] # print("hparams pre-broadcast: ", hparams) - distutils.broadcast_object_list(hp_list) - self.hparams, should_be_0 = hp_list + dist_utils.broadcast_object_list(hp_list) + self.hparams, should_be_0, orion_race_condition = hp_list # print("hparams post-broadcast: ", hparams) assert should_be_0 == 0 + if orion_race_condition: + print( + "\n🚨 Orion reservation race condition detected. Exiting", + "and deleting run dir", + ) + if dist_utils.is_master(): + shutil.rmtree(self.trainer_config["run_dir"]) + return if self.hparams: print("\n💎 Received hyper-parameters from Orion:") print(dump(self.hparams), end="\n") @@ -120,7 +134,7 @@ def run(self, orion_exp=None): print("\nJob was preempted. Wrapping up...\n") self.trainer.close_datasets() - distutils.synchronize() + dist_utils.synchronize() logging.info(f"Total time taken: {time.time() - start_time}") if self.trainer.logger is not None: self.trainer.logger.log({"Total time": time.time() - start_time}) @@ -128,7 +142,7 @@ def run(self, orion_exp=None): objective = self.trainer.objective # print("objective pre-broadcast: ", objective) o_list = [objective] - distutils.broadcast_object_list(o_list) + dist_utils.broadcast_object_list(o_list) objective = o_list[0] # print("objective post-broadcast: ", objective) @@ -162,12 +176,12 @@ def run(self, orion_exp=None): original_trainer_config = copy.deepcopy(trainer_config) if args.distributed: - distutils.setup(trainer_config) + dist_utils.setup(trainer_config) print("Distributed backend setup.") - if distutils.is_master(): + if dist_utils.is_master(): trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) - # distutils.synchronize() + # dist_utils.synchronize() # ------------------- # ----- Setup ----- @@ -183,7 +197,7 @@ def run(self, orion_exp=None): # ------------------- # ----- Train ----- # ------------------- - if args.orion_exp_config_path and distutils.is_master(): + if args.orion_exp_config_path and dist_utils.is_master(): experiment = load_orion_exp(args) print("\nStarting runner.") runner.run(orion_exp=experiment) @@ -198,10 +212,10 @@ def run(self, orion_exp=None): finally: if args.distributed: print( - "\nWaiting for all processes to finish with distutils.cleanup()...", + "\nWaiting for all processes to finish with dist_utils.cleanup()...", end="", ) - distutils.cleanup() + dist_utils.cleanup() print("Done!") if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read():
diff --git a/ocpmodels/common/data_parallel.py b/ocpmodels/common/data_parallel.py index b66b90ede2..9c57b6bc71 100644 --- a/ocpmodels/common/data_parallel.py +++ b/ocpmodels/common/data_parallel.py @@ -14,7 +14,7 @@ import torch from torch.utils.data import BatchSampler, DistributedSampler, Sampler -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.datasets import data_list_collater @@ -216,7 +216,7 @@ def __iter__(self): sizes = 
[self.sizes[idx] for idx in batch_idx] idx_sizes = torch.stack([torch.tensor(batch_idx), torch.tensor(sizes)]) - idx_sizes_all = distutils.all_gather(idx_sizes, device=self.device) + idx_sizes_all = dist_utils.all_gather(idx_sizes, device=self.device) idx_sizes_all = torch.cat(idx_sizes_all, dim=-1).cpu() idx_all = idx_sizes_all[0] sizes_all = idx_sizes_all[1] diff --git a/ocpmodels/common/distutils.py b/ocpmodels/common/dist_utils.py similarity index 100% rename from ocpmodels/common/distutils.py rename to ocpmodels/common/dist_utils.py diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index be8ab8f307..44e56ea0fe 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -105,10 +105,15 @@ def print_status(self): ) print("{:32} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) print("{:32} : {}".format("Algorithm's budgets", str(self.budgets))) + sq_cmd = ( + "/opt/slurm/bin/squeue" + if "CC_CUSTER" not in os.environ + else "/opt/software/slurm/bin/squeue" + ) sq = set( [ j.strip() - for j in os.popen("/opt/slurm/bin/squeue -u $USER -o '%12i'") + for j in os.popen(f"{sq_cmd} -u $USER -o '%12i'") .read() .splitlines()[1:] ] diff --git a/ocpmodels/modules/loss.py b/ocpmodels/modules/loss.py index 42122b5d07..cb305f0759 100644 --- a/ocpmodels/modules/loss.py +++ b/ocpmodels/modules/loss.py @@ -1,7 +1,7 @@ import torch from torch import nn -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils class L2MAELoss(nn.Module): @@ -30,9 +30,9 @@ def forward(self, input: torch.Tensor, target: torch.Tensor): loss = self.loss_fn(input, target) if self.reduction == "mean": num_samples = input.shape[0] - num_samples = distutils.all_reduce(num_samples, device=input.device) + num_samples = dist_utils.all_reduce(num_samples, device=input.device) # Multiply by world size since gradients are averaged # across DDP replicas - return loss * distutils.get_world_size() / num_samples + return loss * dist_utils.get_world_size() / num_samples else: return loss diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 2a07f85d3f..52adcbdf9a 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -27,7 +27,7 @@ from torch_geometric.data import Batch from tqdm import tqdm -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.data_parallel import ( BalancedBatchSampler, OCPDataParallel, @@ -58,7 +58,7 @@ def __init__(self, **kwargs): self.config = { **kwargs, "model_name": model_name, - "gpus": distutils.get_world_size() if not kwargs["cpu"] else 0, + "gpus": dist_utils.get_world_size() if not kwargs["cpu"] else 0, "commit": get_commit_hash(), "checkpoint_dir": str(Path(run_dir) / "checkpoints"), "results_dir": str(Path(run_dir) / "results"), @@ -94,7 +94,7 @@ def __init__(self, **kwargs): timestamp = torch.tensor(datetime.datetime.now().timestamp()).to(self.device) # create directories from master rank only - distutils.broadcast(timestamp, 0) + dist_utils.broadcast(timestamp, 0) timestamp = datetime.datetime.fromtimestamp(timestamp.int()).strftime( "%Y-%m-%d-%H-%M-%S" ) @@ -121,7 +121,7 @@ def __init__(self, **kwargs): ): self.normalizer = self.config["dataset"]["train"] - if not self.is_debug and distutils.is_master() and not self.is_hpo: + if not self.is_debug and dist_utils.is_master() and not self.is_hpo: os.makedirs(self.config["checkpoint_dir"], exist_ok=True) os.makedirs(self.config["results_dir"], 
exist_ok=True) os.makedirs(self.config["logs_dir"], exist_ok=True) @@ -138,7 +138,7 @@ def __init__(self, **kwargs): # default is no checkpointing self.hpo_checkpoint_every = self.config["optim"].get("checkpoint_every", -1) - if distutils.is_master() and not self.silent: + if dist_utils.is_master() and not self.silent: print("🧰 Trainer config:") print(yaml.dump(self.config), end="\n\n") self.load() @@ -174,7 +174,7 @@ def load_seed_from_config(self): def load_logger(self): self.logger = None - if not self.is_debug and distutils.is_master() and not self.is_hpo: + if not self.is_debug and dist_utils.is_master() and not self.is_hpo: assert self.config["logger"] is not None, "Specify logger in config" logger = self.config["logger"] @@ -194,8 +194,8 @@ def get_sampler(self, dataset, batch_size, shuffle): sampler = BalancedBatchSampler( dataset, batch_size=batch_size, - num_replicas=distutils.get_world_size(), - rank=distutils.get_rank(), + num_replicas=dist_utils.get_world_size(), + rank=dist_utils.get_rank(), device=self.device, mode=balancing_mode, shuffle=shuffle, @@ -355,7 +355,7 @@ def load_model(self): **self.config["model"], ).to(self.device) - if distutils.is_master() and not self.silent: + if dist_utils.is_master() and not self.silent: logging.info( f"Loaded {self.model.__class__.__name__} with " f"{self.model.num_params} parameters." @@ -369,7 +369,7 @@ def load_model(self): output_device=self.device, num_gpus=1 if not self.cpu else 0, ) - if distutils.initialized(): + if dist_utils.initialized(): self.model = DistributedDataParallel( self.model, device_ids=[self.device], output_device=self.device ) @@ -390,12 +390,12 @@ def load_checkpoint(self, checkpoint_path): # if trained with ddp and want to load in non-ddp, modify keys from # module.module.. -> module.. 
first_key = next(iter(checkpoint["state_dict"])) - if not distutils.initialized() and first_key.split(".")[1] == "module": + if not dist_utils.initialized() and first_key.split(".")[1] == "module": # No need for OrderedDict since dictionaries are technically ordered # since Python 3.6 and officially ordered since Python 3.7 new_dict = {k[7:]: v for k, v in checkpoint["state_dict"].items()} self.model.load_state_dict(new_dict) - elif distutils.initialized() and first_key.split(".")[1] != "module": + elif dist_utils.initialized() and first_key.split(".")[1] != "module": new_dict = {f"module.{k}": v for k, v in checkpoint["state_dict"].items()} self.model.load_state_dict(new_dict) else: @@ -439,7 +439,7 @@ def load_loss(self): self.loss_fn[loss] = L2MAELoss() else: raise NotImplementedError(f"Unknown loss function name: {loss_name}") - if distutils.initialized(): + if dist_utils.initialized(): self.loss_fn[loss] = DDPLoss(self.loss_fn[loss]) def load_optimizer(self): @@ -505,7 +505,7 @@ def save( checkpoint_file="checkpoint.pt", training_state=True, ): - if not self.is_debug and distutils.is_master(): + if not self.is_debug and dist_utils.is_master(): if training_state: save_checkpoint( { @@ -551,7 +551,7 @@ def save( ) if self.ema: self.ema.restore() - distutils.synchronize() + dist_utils.synchronize() def save_hpo(self, epoch, step, metrics, checkpoint_every): # default is no checkpointing @@ -601,7 +601,7 @@ def validate( is_final=False, is_first=False, ): - if distutils.is_master() and not self.silent: + if dist_utils.is_master() and not self.silent: print() logging.info(f"🧐 Evaluating on {split}.") if self.is_hpo: @@ -617,7 +617,7 @@ def validate( model_regresses_forces=self.config["model"].get("regress_forces", ""), ) metrics = {} - desc = "device {}".format(distutils.get_rank()) + desc = "device {}".format(dist_utils.get_rank()) loader = self.loaders[split] times = Times(gpu=True) @@ -651,10 +651,10 @@ def validate( aggregated_metrics = {} for k in metrics: aggregated_metrics[k] = { - "total": distutils.all_reduce( + "total": dist_utils.all_reduce( metrics[k]["total"], average=False, device=self.device ), - "numel": distutils.all_reduce( + "numel": dist_utils.all_reduce( metrics[k]["numel"], average=False, device=self.device ), } @@ -670,7 +670,7 @@ def validate( log_dict["model_forward_time_mean"] = mean_val_times["model_forward"] log_dict["model_forward_time_std"] = std_val_times["model_forward"] - if distutils.is_master() and not self.silent: + if dist_utils.is_master() and not self.silent: log_str = ["{}: {:.4f}".format(k, v) for k, v in log_dict.items()] print("\n > ".join([""] + log_str)) print() @@ -741,7 +741,7 @@ def save_results(self, predictions, results_file, keys): results_file_path = os.path.join( self.config["results_dir"], - f"{self.task_name}_{results_file}_{distutils.get_rank()}.npz", + f"{self.task_name}_{results_file}_{dist_utils.get_rank()}.npz", ) np.savez_compressed( results_file_path, @@ -749,15 +749,15 @@ def save_results(self, predictions, results_file, keys): **{key: predictions[key] for key in keys}, ) - distutils.synchronize() - if distutils.is_master(): + dist_utils.synchronize() + if dist_utils.is_master(): gather_results = defaultdict(list) full_path = os.path.join( self.config["results_dir"], f"{self.task_name}_{results_file}.npz", ) - for i in range(distutils.get_world_size()): + for i in range(dist_utils.get_world_size()): rank_path = os.path.join( self.config["results_dir"], f"{self.task_name}_{results_file}_{i}.npz", @@ -843,7 +843,7 @@ def 
eval_all_splits( } # Log specific metrics - if final and self.config["logger"] == "wandb" and distutils.is_master(): + if final and self.config["logger"] == "wandb" and dist_utils.is_master(): overall_energy_mae = cumulated_energy_mae / len(all_splits) self.logger.log({"Eval time": cumulated_time}) self.objective = overall_energy_mae diff --git a/ocpmodels/trainers/energy_trainer.py b/ocpmodels/trainers/energy_trainer.py index 40d09e5379..403dbece6f 100644 --- a/ocpmodels/trainers/energy_trainer.py +++ b/ocpmodels/trainers/energy_trainer.py @@ -13,7 +13,7 @@ import torch_geometric from tqdm import tqdm -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.registry import registry from ocpmodels.trainers.base_trainer import BaseTrainer @@ -39,7 +39,7 @@ def load_task(self): @torch.no_grad() def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False): - if distutils.is_master() and not disable_tqdm: + if dist_utils.is_master() and not disable_tqdm: logging.info("Predicting on test.") assert isinstance( loader, @@ -48,7 +48,7 @@ def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False) torch_geometric.data.Batch, ), ) - rank = distutils.get_rank() + rank = dist_utils.get_rank() if isinstance(loader, torch_geometric.data.Batch): loader = [[loader]] @@ -320,9 +320,9 @@ def _log_metrics(self, end_of_epoch=False): ) if ( self.step % self.config["print_every"] == 0 - and distutils.is_master() + and dist_utils.is_master() and not self.is_hpo - ) or (distutils.is_master() and end_of_epoch): + ) or (dist_utils.is_master() and end_of_epoch): log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] if not self.silent: print(", ".join(log_str)) diff --git a/ocpmodels/trainers/forces_trainer.py b/ocpmodels/trainers/forces_trainer.py index d46724f412..b8b809ec3a 100644 --- a/ocpmodels/trainers/forces_trainer.py +++ b/ocpmodels/trainers/forces_trainer.py @@ -14,7 +14,7 @@ import torch_geometric from tqdm import tqdm -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.registry import registry from ocpmodels.common.relaxation.ml_relaxation import ml_relax from ocpmodels.common.utils import check_traj_files @@ -89,7 +89,7 @@ def predict( results_file=None, disable_tqdm=False, ): - if distutils.is_master() and not disable_tqdm: + if dist_utils.is_master() and not disable_tqdm: logging.info("Predicting on test.") assert isinstance( data_loader, @@ -98,7 +98,7 @@ def predict( torch_geometric.data.Batch, ), ) - rank = distutils.get_rank() + rank = dist_utils.get_rank() if isinstance(data_loader, torch_geometric.data.Batch): data_loader = [[data_loader]] @@ -251,7 +251,7 @@ def train(self, disable_eval_tqdm=False): ) if ( self.step % self.config["print_every"] == 0 - and distutils.is_master() + and dist_utils.is_master() and not self.is_hpo ): log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] @@ -376,11 +376,11 @@ def compute_loss(self, out, batch_list): train_loss_force_normalizer = 3.0 * weight.sum() # add up normalizer to obtain global normalizer - distutils.all_reduce(train_loss_force_normalizer) + dist_utils.all_reduce(train_loss_force_normalizer) # perform loss normalization before backprop train_loss_force_normalized = train_loss_force_unnormalized * ( - distutils.get_world_size() / train_loss_force_normalizer + dist_utils.get_world_size() / train_loss_force_normalizer ) loss.append(train_loss_force_normalized) @@ -534,7 +534,7 @@ def 
run_relaxations(self, split="val"): ) if self.config["task"].get("write_pos", False): - rank = distutils.get_rank() + rank = dist_utils.get_rank() pos_filename = os.path.join( self.config["results_dir"], f"relaxed_pos_{rank}.npz" ) @@ -545,15 +545,15 @@ def run_relaxations(self, split="val"): chunk_idx=chunk_idx, ) - distutils.synchronize() - if distutils.is_master(): + dist_utils.synchronize() + if dist_utils.is_master(): gather_results = defaultdict(list) full_path = os.path.join( self.config["results_dir"], "relaxed_positions.npz", ) - for i in range(distutils.get_world_size()): + for i in range(dist_utils.get_world_size()): rank_path = os.path.join( self.config["results_dir"], f"relaxed_pos_{i}.npz", @@ -586,12 +586,12 @@ def run_relaxations(self, split="val"): aggregated_metrics = {} for k in metrics: aggregated_metrics[k] = { - "total": distutils.all_reduce( + "total": dist_utils.all_reduce( metrics[k]["total"], average=False, device=self.device, ), - "numel": distutils.all_reduce( + "numel": dist_utils.all_reduce( metrics[k]["numel"], average=False, device=self.device, @@ -611,7 +611,7 @@ def run_relaxations(self, split="val"): split=split, ) - if distutils.is_master(): + if dist_utils.is_master(): logging.info(metrics) if self.ema: diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index eeb1b50356..b4395cdcce 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -19,7 +19,7 @@ from torch_geometric.data import Data from tqdm import tqdm -from ocpmodels.common import distutils +from ocpmodels.common import dist_utils from ocpmodels.common.registry import registry from ocpmodels.common.relaxation.ml_relaxation import ml_relax from ocpmodels.common.utils import OCP_TASKS, check_traj_files @@ -94,7 +94,7 @@ def load_task(self): @torch.no_grad() def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False): - if distutils.is_master() and not disable_tqdm: + if dist_utils.is_master() and not disable_tqdm: logging.info("Predicting on test.") assert isinstance( loader, @@ -103,7 +103,7 @@ def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False) torch_geometric.data.Batch, ), ) - rank = distutils.get_rank() + rank = dist_utils.get_rank() if isinstance(loader, torch_geometric.data.Batch): loader = [[loader]] @@ -503,11 +503,11 @@ def compute_loss(self, preds, batch_list): train_loss_force_normalizer = 3.0 * weight.sum() # add up normalizer to obtain global normalizer - distutils.all_reduce(train_loss_force_normalizer) + dist_utils.all_reduce(train_loss_force_normalizer) # perform loss normalization before backprop train_loss_force_normalized = train_loss_force_unnormalized * ( - distutils.get_world_size() / train_loss_force_normalizer + dist_utils.get_world_size() / train_loss_force_normalizer ) loss.append(train_loss_force_normalized) @@ -624,9 +624,9 @@ def log_train_metrics(self, end_of_epoch=False): ) if ( self.step % self.config["print_every"] == 0 - and distutils.is_master() + and dist_utils.is_master() and not self.is_hpo - ) or (distutils.is_master() and end_of_epoch): + ) or (dist_utils.is_master() and end_of_epoch): if not self.silent: log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] print( @@ -859,7 +859,7 @@ def run_relaxations(self, split="val"): ) if self.config["task"].get("write_pos", False): - rank = distutils.get_rank() + rank = dist_utils.get_rank() pos_filename = os.path.join( self.config["results_dir"], f"relaxed_pos_{rank}.npz" ) 
@@ -870,15 +870,15 @@ def run_relaxations(self, split="val"): chunk_idx=chunk_idx, ) - distutils.synchronize() - if distutils.is_master(): + dist_utils.synchronize() + if dist_utils.is_master(): gather_results = defaultdict(list) full_path = os.path.join( self.config["results_dir"], "relaxed_positions.npz", ) - for i in range(distutils.get_world_size()): + for i in range(dist_utils.get_world_size()): rank_path = os.path.join( self.config["results_dir"], f"relaxed_pos_{i}.npz", @@ -911,12 +911,12 @@ def run_relaxations(self, split="val"): aggregated_metrics = {} for k in metrics: aggregated_metrics[k] = { - "total": distutils.all_reduce( + "total": dist_utils.all_reduce( metrics[k]["total"], average=False, device=self.device, ), - "numel": distutils.all_reduce( + "numel": dist_utils.all_reduce( metrics[k]["numel"], average=False, device=self.device, @@ -936,7 +936,7 @@ def run_relaxations(self, split="val"): split=split, ) - if distutils.is_master(): + if dist_utils.is_master(): logging.info(metrics) if self.ema: From 2728eb124168bfd153556020f3d3e38fdd861a3f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 22:26:22 -0500 Subject: [PATCH 140/273] typo --- ocpmodels/common/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 44e56ea0fe..f59a540919 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -107,7 +107,7 @@ def print_status(self): print("{:32} : {}".format("Algorithm's budgets", str(self.budgets))) sq_cmd = ( "/opt/slurm/bin/squeue" - if "CC_CUSTER" not in os.environ + if "CC_CLUSTER" not in os.environ else "/opt/software/slurm/bin/squeue" ) sq = set( From 6967e3ec93559df9799d8a0338257bab74aa9213 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 22:28:54 -0500 Subject: [PATCH 141/273] clean trailing line --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 5ba27a16cf..42bc745809 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -196,7 +196,7 @@ def continue_orion_exp(trainer_config): return trainer_config trainer_config["checkpoint"] = str(resume_ckpts[-1]) - resume_url = (resume_dir / "wandb_url.txt").read_text() + resume_url = (resume_dir / "wandb_url.txt").read_text().strip() trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] print( From dfadf1a442ed350c9388db8669ac42f9e3388507 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 15 Jan 2023 23:59:17 -0500 Subject: [PATCH 142/273] use tmpdir env var --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 7c39a9d987..aefe165f58 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -310,7 +310,7 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): print("\nMoving data to slurm tmpdir", flush=True) - tmp_dir = Path(f"/Tmp/slurm.{JOB_ID}.0") + tmp_dir = os.environ.get("SLURM_TMPDIR") or Path(f"/Tmp/slurm.{JOB_ID}.0") for s, split in trainer_config["dataset"].items(): if not isinstance(split, dict): continue From 3023d54194cb3ec2242c1ce9246bda57e76f0397 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 11:17:12 -0500 Subject: [PATCH 143/273] update --- configs/exps/icml/qm9/fanet-manual.yaml | 57 ++++++++++++++++++++++ configs/exps/icml/qm9/fanet-orion-qm9.yaml | 4 +- main.py | 17 
+++++-- 3 files changed, 72 insertions(+), 6 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-manual.yaml diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml new file mode 100644 index 0000000000..b45b7e4587 --- /dev/null +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -0,0 +1,57 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: main + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-v1.0.2-continued + log_train_every: 100 + optim: + warmup_steps: 2000 + # parameters EMA + ema_decay: 0.999 + decay_steps: max_steps + scheduler: LinearWarmupCosineAnnealingLR + batch_size: 64 + initial_lr: 0.001 + max_epochs: 1500 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels + frame_averaging: 3D + fa_frames: random + model: + edge_embed_type: all_rij + energy_head: weighted-av-initial-embeds + graph_norm: True + hidden_channels: 416 + max_num_neighbors: 40 + mp_type: updownscale + num_filters: 512 + num_gaussians: 100 + num_interactions: 3 + otf_graph: false + pg_hidden_channels: 0 + phys_embeds: false + phys_hidden_channels: 0 + second_layer_MLP: false + skip_co: false + tag_hidden_channels: 0 + use_pbc: false + regress_forces: "" + + +runs: + - {} + - model: + mp_type: base_with_att \ No newline at end of file diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 983e67f951..97aa0a69bf 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -36,10 +36,10 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v1.0.2 + unique_exp_name: fanet-qm9-v2.0.0 space: - optim/max_epochs: fidelity(50, 300, base=6) + optim/max_epochs: fidelity(100, 2000, base=5) optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) model/graph_norm: choices([True, False]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) diff --git a/main.py b/main.py index d7505e383f..67fb6b6546 100644 --- a/main.py +++ b/main.py @@ -101,6 +101,13 @@ def run(self, orion_exp=None): self.hparams["orion_unique_exp_name"] = orion_exp.name except ReservationRaceCondition: orion_race_condition = True + import wandb + + if wandb.run is not None: + if wandb.run.tags: + wandb.run.tags = wandb.run.tags + ("RaceCondition",) + else: + wandb.run.tags = ("RaceCondition",) should_be_0 = dist_utils.get_rank() hp_list = [self.hparams, should_be_0, orion_race_condition] @@ -147,10 +154,12 @@ def run(self, orion_exp=None): # print("objective post-broadcast: ", objective) if orion_exp is not None: - orion_exp.observe( - orion_trial, - [{"type": "objective", "name": "energy_mae", "value": objective}], - ) + if objective is not None: + orion_exp.observe( + orion_trial, + [{"type": "objective", "name": "energy_mae", "value": objective}], + ) + print("Received None objective from worker. 
Skipping observation.") if __name__ == "__main__": From 42e2f48483a84c35d892b9f5d26ed943ac72b9fc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 11:19:53 -0500 Subject: [PATCH 144/273] hotfix tmp dir --- ocpmodels/common/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index aefe165f58..edade03673 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -310,7 +310,8 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): print("\nMoving data to slurm tmpdir", flush=True) - tmp_dir = os.environ.get("SLURM_TMPDIR") or Path(f"/Tmp/slurm.{JOB_ID}.0") + tmp_dir = os.environ.get("SLURM_TMPDIR") or f"/Tmp/slurm.{JOB_ID}.0" + tmp_dir = Path(tmp_dir) for s, split in trainer_config["dataset"].items(): if not isinstance(split, dict): continue From c0d08ab5b2d6402f6fe021ebf819d905f3fd9d01 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 14:02:20 -0500 Subject: [PATCH 145/273] beluga wandb offline --- sbatch.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sbatch.py b/sbatch.py index 2a6cfd57e1..0d6affdfbe 100644 --- a/sbatch.py +++ b/sbatch.py @@ -38,7 +38,7 @@ else conda activate {env} fi - +{wandb_offline} srun --output={output} {python_command} """ @@ -217,6 +217,7 @@ def write_orion_config(args, outdir): if __name__ == "__main__": # has the submission been successful? success = False + wandb_offline = "" sbatch_py_vars = {} # repository root @@ -307,6 +308,9 @@ def write_orion_config(args, outdir): if "a100" in args.env: modules += ["cuda/11.2"] + if os.environ.get("CC_CLUSTER") == "beluga": + wandb_offline = "wandb offline\necho 'wandb offline'" + # format string template with defaults + command-line args script = template.format( code_loc=(str(resolve(args.code_loc)) if args.code_loc else str(root)), @@ -322,6 +326,7 @@ def write_orion_config(args, outdir): sbatch_params=make_sbatch_params(sbatch_params), sbatch_py_vars=make_sbatch_py_vars(sbatch_py_vars), virtualenv=virtualenv, + wandb_offline=wandb_offline, ) # default script path to execute `sbatch {script_path}/script_{now()}.sh` From 96e476c8645bff70581da01edc74a9f6a5f5e5ca Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 15:00:41 -0500 Subject: [PATCH 146/273] avoir url file on Beluga --- ocpmodels/common/logger.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 0a8a432719..380401332c 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -132,8 +132,9 @@ def __init__(self, trainer_config): wandb.save(str(sbatch_files[0])) self.url = wandb.run.get_url() - with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: - f.write(self.url + "\n") + if self.url: + with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: + f.write(self.url + "\n") if not CLUSTER.drac: self.collect_output_files(policy="live") self.collect_output_files(policy="end") From 5e6bf83d918d9ee35d6d27571948a282e44f28f1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 15:23:23 -0500 Subject: [PATCH 147/273] report 1e12 if Nan --- main.py | 24 ++++++++++-------------- ocpmodels/common/dist_utils.py | 12 +++++++++--- ocpmodels/trainers/single_trainer.py | 2 +- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 67fb6b6546..b3f02d599b 100644 --- a/main.py +++ b/main.py @@ -109,13 +109,9 @@ def 
run(self, orion_exp=None): else: wandb.run.tags = ("RaceCondition",) - should_be_0 = dist_utils.get_rank() - hp_list = [self.hparams, should_be_0, orion_race_condition] - # print("hparams pre-broadcast: ", hparams) - dist_utils.broadcast_object_list(hp_list) - self.hparams, should_be_0, orion_race_condition = hp_list - # print("hparams post-broadcast: ", hparams) - assert should_be_0 == 0 + self.hparams, orion_race_condition = dist_utils.broadcast_from_master( + self.hparams, orion_race_condition + ) if orion_race_condition: if dist_utils.is_master(): shutil.rmtree(self.trainer_config["run_dir"]) @@ -146,20 +142,20 @@ def run(self, orion_exp=None): if self.trainer.logger is not None: self.trainer.logger.log({"Total time": time.time() - start_time}) - objective = self.trainer.objective - # print("objective pre-broadcast: ", objective) - o_list = [objective] - dist_utils.broadcast_object_list(o_list) - objective = o_list[0] - # print("objective post-broadcast: ", objective) + objective = dist_utils.broadcast_from_master(self.trainer.objective) if orion_exp is not None: + if objective is None: + if signal == "loss_is_nan": + objective = 1e12 + print("Received NaN objective from worker. Setting to 1e12.") + else: + print("Received None objective from worker. Skipping observation.") if objective is not None: orion_exp.observe( orion_trial, [{"type": "objective", "name": "energy_mae", "value": objective}], ) - print("Received None objective from worker. Skipping observation.") if __name__ == "__main__": diff --git a/ocpmodels/common/dist_utils.py b/ocpmodels/common/dist_utils.py index d4f4c13894..024b98e280 100644 --- a/ocpmodels/common/dist_utils.py +++ b/ocpmodels/common/dist_utils.py @@ -95,15 +95,21 @@ def broadcast(tensor, src, group=dist.group.WORLD, async_op=False): dist.broadcast(tensor, src, group, async_op) -def broadcast_object_list(obj_list, src=0): +def broadcast_from_master(*obj_list): if get_world_size() == 1: - return + if len(obj_list) == 1: + return obj_list[0] + return obj_list + obj_list = list(obj_list) dist.broadcast_object_list( obj_list, - src=src, + src=0, group=dist.group.WORLD, device=torch.device(f"cuda:{get_rank()}"), ) + if len(obj_list) == 1: + return obj_list[0] + return obj_list def all_reduce(data, group=dist.group.WORLD, average=False, device=None): diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index b4395cdcce..2b43628a61 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -262,7 +262,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): if torch.isnan(loss["total_loss"]): print("\n\n >>> 🛑 Loss is NaN. Stopping training.\n\n") self.logger.add_tags(["nan_loss"]) - return True + return "loss_is_nan" self._backward(loss) # Compute metrics. 
From cad68010d585295381a8c318c5335ba582b3bda9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 16:03:25 -0500 Subject: [PATCH 148/273] clearer prints --- ocpmodels/tasks/task.py | 2 ++ ocpmodels/trainers/base_trainer.py | 1 + 2 files changed, 3 insertions(+) diff --git a/ocpmodels/tasks/task.py b/ocpmodels/tasks/task.py index 0bbd72de32..bcef87446f 100644 --- a/ocpmodels/tasks/task.py +++ b/ocpmodels/tasks/task.py @@ -19,7 +19,9 @@ def __init__(self, config): def setup(self, trainer): self.trainer = trainer if self.config.get("checkpoint") is not None: + print("\n🔵 Resuming:\n • ", end="", flush=True) self.trainer.load_checkpoint(self.config["checkpoint"]) + print() # save checkpoint path to runner state for slurm resubmissions self.chkpt_path = os.path.join( diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 52adcbdf9a..244ddfd317 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -894,6 +894,7 @@ def eval_all_splits( console = Console() console.print(table) print() + print("\n• Trainer objective set to:", self.objective, end="\n\n") def rotate_graph(self, batch, rotation=None): """Rotate all graphs in a batch From cc3ed561729a0573496750a176724f66093e1084 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 18:05:40 -0500 Subject: [PATCH 149/273] Early stop from min_lr --- ocpmodels/modules/scheduler.py | 41 ++++++++++++++++++++++++---- ocpmodels/trainers/base_trainer.py | 6 +++- ocpmodels/trainers/single_trainer.py | 4 ++- 3 files changed, 44 insertions(+), 7 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index e1c203059a..e93fcf78a7 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -100,13 +100,36 @@ class EarlyStopper: """ def __init__( - self, patience=7, mode="min", min_abs_change=1e-5, store_all_steps=True + self, + patience=7, + mode="min", + min_abs_change=1e-5, + store_all_steps=True, + min_lr=-1, ): + """ + Whether train should stop or not. + + Args: + patience (int, optional): How many calls to `should_stop` with no + improvement before stopping training. Defaults to 7. + mode (str, optional): "min" or "max". Defaults to "min". + min_abs_change (float, optional): Minimum metric change to be considered an + improvement. Defaults to 1e-5. + store_all_steps (bool, optional): Whether to store all metrics passed to + `should_stop` or only the last `patience` ones. Defaults to True. + min_lr (bool, optional): Whether to stop when the current learning rate + reaches the . Defaults to -1. + + Raises: + ValueError: Unknown mode (neither min nor max) + """ self.patience = patience self.mode = mode self.counter = 0 self.min_abs_change = min_abs_change self.store_all_steps = store_all_steps + self.min_lr = min_lr self.metrics = [] if self.mode == "min": @@ -116,12 +139,17 @@ def __init__( else: raise ValueError("mode must be either min or max") - self.early_stop = False + self.early_stop = "" - def should_stop(self, metric): + def should_stop(self, metric, lr=None): """ - Returns True if the metric has not improved for a certain number of - steps. False otherwise. Stores the metric in `self.metrics`: all the steps if + Returns why the training should stop: + • Empty string if the training shouldn't stop + • "metric" if the metric has not improved for a certain number of + steps. + • "lr" if the learning rate has reached the minimum value. 
+ + Stores the metric in `self.metrics`: all the steps if `self.store_all_steps` is `True`, otherwise only the last `n=self.patience`. Args: @@ -151,6 +179,9 @@ def should_stop(self, metric): if self.counter >= self.patience: self.early_stop = True + if lr is not None and lr <= self.min_lr: + self.early_stop = True + return self.early_stop @property diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 244ddfd317..8500676f1a 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -83,7 +83,11 @@ def __init__(self, **kwargs): self.datasets = {} self.samplers = {} self.loaders = {} - self.early_stopper = EarlyStopper(patience=10, min_abs_change=1e-5) + self.early_stopper = EarlyStopper( + patience=10, + min_abs_change=1e-5, + min_lr=self.config["optim"].get("min_lr", -1), + ) if torch.cuda.is_available() and not self.cpu: self.device = torch.device(f"cuda:{self.config['local_rank']}") diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 2b43628a61..7fcdc9cd39 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -330,7 +330,9 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): checkpoint_file="best_checkpoint.pt", training_state=False, ) - if self.early_stopper.should_stop(current_val_metric): + if self.early_stopper.should_stop( + current_val_metric, self.scheduler.get_lr() + ): print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") if self.logger: self.logger.add_tags(["E-S"]) From 2c067ab77295149f6e8e06c83fbf9c3a64a2b460 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 18:05:57 -0500 Subject: [PATCH 150/273] update configs --- configs/exps/icml/qm9/fanet-manual.yaml | 43 ++++++++++++++++--------- configs/exps/qm7x/schnet.yaml | 37 ++++++++++++--------- configs/models/tasks/qm7x.yaml | 4 ++- 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index b45b7e4587..91fc451cbb 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -3,7 +3,7 @@ job: mem: 12GB cpus: 4 gres: gpu:16gb:1 - partition: main + partition: long default: wandb_project: ocp-qm @@ -12,15 +12,6 @@ default: test_ri: true wandb_tags: qm9, fanet-qm9-v1.0.2-continued log_train_every: 100 - optim: - warmup_steps: 2000 - # parameters EMA - ema_decay: 0.999 - decay_steps: max_steps - scheduler: LinearWarmupCosineAnnealingLR - batch_size: 64 - initial_lr: 0.001 - max_epochs: 1500 note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial @@ -30,19 +21,29 @@ default: targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels frame_averaging: 3D fa_frames: random + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + decay_steps: max_steps + scheduler: LinearWarmupCosineAnnealingLR + batch_size: 64 + initial_lr: 0.001 + max_epochs: 1500 model: + cutoff: 5.0 edge_embed_type: all_rij energy_head: weighted-av-initial-embeds graph_norm: True hidden_channels: 416 max_num_neighbors: 40 mp_type: updownscale - num_filters: 512 - num_gaussians: 100 - num_interactions: 3 + num_filters: 256 + num_gaussians: 50 + num_interactions: 5 otf_graph: false - pg_hidden_channels: 0 - phys_embeds: false + pg_hidden_channels: 32 + phys_embeds: true 
phys_hidden_channels: 0 second_layer_MLP: false skip_co: false @@ -54,4 +55,14 @@ default: runs: - {} - model: - mp_type: base_with_att \ No newline at end of file + mp_type: base_with_att + - model: + cutoff: 6.0 + - optim: + initial_lr: 0.0005 + - optim: + batch_size: 128 + - optim: + batch_size: 512 + - model: + energy_head: "" diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index 73e5ace742..9282c3ffe2 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -25,42 +25,47 @@ default: model: name, num_gaussians, hidden_channels, num_filters, num_interactions optim: batch_size, lr_initial optim: - batch_size: 2048 + batch_size: 512 warmup_steps: 1000 lr_initial: 0.0005 # parameters EMA ema_decay: 0.999 - decay_steps: 750000 - decay_rate: 0.05 - max_steps: 200000 + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + loss_energy: mse + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.00001 + verbose: true model: hidden_channels: 256 - num_filters: 256 + num_filters: 128 num_gaussians: 100 num_interactions: 6 cutoff: 5.0 + regress_forces: "from_energy" runs: - {} - optim: - batch_size: 2048 + lr_initial: 0.001 + batch_size: 1024 - optim: - batch_size: 4096 + batch_size: 256 - optim: lr_initial: 0.001 - optim: lr_initial: 0.001 batch_size: 2048 - model: - num_gaussians: 200 - - model: - hidden_channels: 1024 - - model: - num_filters: 1024 - - model: - num_interactions: 8 + hidden_channels: 512 - model: num_interactions: 6 num_gaussians: 20 - num_filters: 64 - hidden_channels: 1024 \ No newline at end of file + hidden_channels: 512 \ No newline at end of file diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 98de512a2d..0a6fa33094 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -18,9 +18,11 @@ default: optim: optimizer: AdamW - force_coefficient: 30 energy_coefficient: 1 energy_grad_coefficient: 10 + force_coefficient: 100 + loss_energy: mse + loss_force: mse normalizer: null graph_rewiring: "" From 3f572cd83176e444e5d106208c6919affa972aa2 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 18:11:21 -0500 Subject: [PATCH 151/273] handle float eval_every --- configs/exps/qm7x/schnet.yaml | 2 ++ ocpmodels/trainers/single_trainer.py | 7 +++++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index 9282c3ffe2..a62f41b469 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -28,6 +28,7 @@ default: batch_size: 512 warmup_steps: 1000 lr_initial: 0.0005 + eval_every: 0.34 # parameters EMA ema_decay: 0.999 energy_coefficient: 1 @@ -35,6 +36,7 @@ default: force_coefficient: 100 loss_energy: mse loss_force: mse + eval_every: # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 7fcdc9cd39..93306633ad 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -193,6 +193,8 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): n_train = len(self.loaders["train"]) epoch_int = 0 eval_every = self.config["optim"].get("eval_every", n_train) + if eval_every < 1: + eval_every = int(n_train * eval_every) if self.config["print_every"] < 0: self.config["print_every"] = n_train 
primary_metric = self.config["task"].get( @@ -211,9 +213,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): model_run_time = 0 if not self.silent: - print(f"--- 🔄 Beginning of Training @ {self.now}---\n") - print(f"Logging train metrics every {log_train_every} steps") + print(f"\n--- 🔄 Beginning of Training @ {self.now}---\n") + print(f"\nLogging train metrics every {log_train_every} steps") print(f"Printing train metrics every {self.config['print_every']} steps") + print(f"\nEvaluating every {eval_every} steps\n") for epoch_int in range(start_epoch, self.config["optim"]["max_epochs"]): From 6c8c48ef85c50c47fdf5d7aee6438b294efd41fc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 23:00:21 -0500 Subject: [PATCH 152/273] increase ES patience --- configs/exps/icml/qm9/fanet-manual.yaml | 12 +++++++++++- configs/exps/qm7x/schnet.yaml | 3 +-- ocpmodels/trainers/base_trainer.py | 2 +- ocpmodels/trainers/single_trainer.py | 2 +- 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index 91fc451cbb..6830f52f21 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -1,4 +1,4 @@ -# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +# scheduler reduce lr on plateau job: mem: 12GB cpus: 4 @@ -30,6 +30,16 @@ default: batch_size: 64 initial_lr: 0.001 max_epochs: 1500 + loss_energy: mse + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.00001 + verbose: true model: cutoff: 5.0 edge_embed_type: all_rij diff --git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet.yaml index a62f41b469..b5ecf97358 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet.yaml @@ -1,6 +1,6 @@ # trainset has 4068193 samples job: - mem: 48GB + mem: 32GB cpus: 8 gres: gpu:16gb:1 partition: long @@ -36,7 +36,6 @@ default: force_coefficient: 100 loss_energy: mse loss_force: mse - eval_every: # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 8500676f1a..e8917a389e 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -84,7 +84,7 @@ def __init__(self, **kwargs): self.samplers = {} self.loaders = {} self.early_stopper = EarlyStopper( - patience=10, + patience=15, min_abs_change=1e-5, min_lr=self.config["optim"].get("min_lr", -1), ) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 93306633ad..5a6ed887b1 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -192,7 +192,7 @@ def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False) def train(self, disable_eval_tqdm=True, debug_batches=-1): n_train = len(self.loaders["train"]) epoch_int = 0 - eval_every = self.config["optim"].get("eval_every", n_train) + eval_every = self.config["optim"].get("eval_every", n_train) or n_train if eval_every < 1: eval_every = int(n_train * eval_every) if self.config["print_every"] < 0: From 6e242cff339e938901cf63bfa49b5c3c0bfe2611 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 23:29:23 -0500 Subject: [PATCH 153/273] fix warmup scheduler state dict --- configs/exps/icml/qm9/fanet-manual.yaml | 8 ++--- ocpmodels/trainers/base_trainer.py | 43 
+++++++++++++------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index 6830f52f21..0fc5b23986 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -43,9 +43,9 @@ default: model: cutoff: 5.0 edge_embed_type: all_rij - energy_head: weighted-av-initial-embeds + energy_head: weighted-av-final-embeds graph_norm: True - hidden_channels: 416 + hidden_channels: 350 max_num_neighbors: 40 mp_type: updownscale num_filters: 256 @@ -68,11 +68,11 @@ runs: mp_type: base_with_att - model: cutoff: 6.0 - - optim: - initial_lr: 0.0005 - optim: batch_size: 128 - optim: batch_size: 512 - model: energy_head: "" + - model: + energy_head: "weighted-av-initial-embeds" diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e8917a389e..8830a29c01 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -511,27 +511,30 @@ def save( ): if not self.is_debug and dist_utils.is_master(): if training_state: - save_checkpoint( - { - "epoch": self.epoch, - "step": self.step, - "state_dict": self.model.state_dict(), - "optimizer": self.optimizer.state_dict(), - "scheduler": self.scheduler.scheduler.state_dict() - if self.scheduler.scheduler_type != "Null" - else None, - "warmup_scheduler": self.scheduler.warmup_scheduler.state_dict() - if hasattr(self.scheduler, "warmup_scheduler") - else None, - "normalizers": { - key: value.state_dict() - for key, value in self.normalizers.items() - }, - "config": self.config, - "val_metrics": metrics, - "ema": self.ema.state_dict() if self.ema else None, - "amp": self.scaler.state_dict() if self.scaler else None, + ckpt_dict = { + "epoch": self.epoch, + "step": self.step, + "state_dict": self.model.state_dict(), + "optimizer": self.optimizer.state_dict(), + "scheduler": self.scheduler.scheduler.state_dict() + if self.scheduler.scheduler_type != "Null" + else None, + "normalizers": { + key: value.state_dict() + for key, value in self.normalizers.items() }, + "config": self.config, + "val_metrics": metrics, + "ema": self.ema.state_dict() if self.ema else None, + "amp": self.scaler.state_dict() if self.scaler else None, + } + if self.scheduler.warmup_scheduler is not None: + ckpt_dict[ + "warmup_scheduler" + ] = self.scheduler.warmup_scheduler.state_dict() + + save_checkpoint( + ckpt_dict, checkpoint_dir=self.config["checkpoint_dir"], checkpoint_file=checkpoint_file, ) From 76cac140221b01aa4ebf68867c27830ef741fb86 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 16 Jan 2023 23:30:25 -0500 Subject: [PATCH 154/273] fix load warmup_scheduler --- ocpmodels/trainers/base_trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 8830a29c01..e22a648d03 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -409,8 +409,9 @@ def load_checkpoint(self, checkpoint_path): self.optimizer.load_state_dict(checkpoint["optimizer"]) if "scheduler" in checkpoint and checkpoint["scheduler"] is not None: self.scheduler.scheduler.load_state_dict(checkpoint["scheduler"]) - if checkpoint.get("warmup_scheduler") is not None and hasattr( - self.scheduler, "warmup_scheduler" + if ( + checkpoint.get("warmup_scheduler") is not None + and self.scheduler.warmup_scheduler is not None ): self.scheduler.warmup_scheduler.load_state_dict( 
checkpoint["warmup_scheduler"] From a2f4d373d4583850f6e731f2aa07c595d035430f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 17 Jan 2023 05:36:50 -0500 Subject: [PATCH 155/273] orion 2-3, baseline and top config --- configs/exps/icml/is2re-all/baseline.yaml | 18 +++ .../exps/icml/is2re-all/fanet-orion-2.yaml | 10 +- .../exps/icml/is2re-all/fanet-orion-3.yaml | 58 ++++++++++ configs/exps/icml/is2re-all/top-config.yaml | 107 ++++++++++++++++++ configs/models/fanet.yaml | 1 + 5 files changed, 189 insertions(+), 5 deletions(-) create mode 100644 configs/exps/icml/is2re-all/baseline.yaml create mode 100644 configs/exps/icml/is2re-all/fanet-orion-3.yaml create mode 100644 configs/exps/icml/is2re-all/top-config.yaml diff --git a/configs/exps/icml/is2re-all/baseline.yaml b/configs/exps/icml/is2re-all/baseline.yaml new file mode 100644 index 0000000000..45102f9e29 --- /dev/null +++ b/configs/exps/icml/is2re-all/baseline.yaml @@ -0,0 +1,18 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:4 + partition: long + time: 20:00:00 + +default: + test_ri: True + mode: train + wandb_tags: 'baseline' + cp_data_to_tmpdir: true + +runs: + - config: dpp-is2re-all + note: 'Baseline 4 gpus' + - config: schnet-is2re-all + note: 'Baseline 4 gpus' diff --git a/configs/exps/icml/is2re-all/fanet-orion-2.yaml b/configs/exps/icml/is2re-all/fanet-orion-2.yaml index ae26af2e9e..cf88591af6 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-2.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-2.yaml @@ -3,7 +3,7 @@ job: mem: 32GB cpus: 4 gres: gpu:rtx8000:1 - time: 10:00:00 + time: 14:00:00 partition: long default: @@ -32,15 +32,15 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 216 + n_jobs: 72 - unique_exp_name: fanet-is2re-all-v1 + unique_exp_name: fanet-is2re-all-v2 space: optim/max_epochs: fidelity(15, 30, base=6) optim/lr_initial: loguniform(6e-4, 4e-3, precision=2) model/hidden_channels: uniform(8, 19, discrete=True) - model/mp_type: choices(["base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "updown_local_env"]) + model/mp_type: choices(["base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "simple", "updown_local_env"]) model/num_filters: uniform(3, 18, discrete=True) model/num_gaussians: uniform(50, 170, discrete=True) model/num_interactions: uniform(3, 7, discrete=True) @@ -48,7 +48,7 @@ orion: model/phys_embeds: choices([True, False]) model/tag_hidden_channels: uniform(0, 3, discrete=True) model/complex_mp: choices([True, False]) - model/att_heads: choices([1,3,6]) + model/att_heads: choices([1,3,5]) model/second_layer_MLP: choices([True, False]) model/skip_co: choices(["add", "concat", False]) model/cutoff: choices([4.0, 6.0, 10.0]) diff --git a/configs/exps/icml/is2re-all/fanet-orion-3.yaml b/configs/exps/icml/is2re-all/fanet-orion-3.yaml new file mode 100644 index 0000000000..1337d47b1e --- /dev/null +++ b/configs/exps/icml/is2re-all/fanet-orion-3.yaml @@ -0,0 +1,58 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + time: 14:00:00 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-is2re-all + mode: train + test_ri: true + wandb_tags: is2re-all, orion-3 + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + graph_norm: True + frame_averaging: 2D + fa_frames: random + optim: + scheduler: LinearWarmupCosineAnnealingLR + note: + model: name, num_gaussians, 
hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 72 + + unique_exp_name: fanet-is2re-all-v2 + + space: + optim/max_epochs: fidelity(7, 15, base=6) + optim/lr_initial: loguniform(9e-4, 5e-3, precision=2) + model/hidden_channels: uniform(8, 16, discrete=True) + model/energy_head: choices(["weighted-av-final-embeds", False]) + model/mp_type: choices(["base", "updownscale", "updownscale_base", "updown_local_env"]) + model/num_filters: uniform(4, 18, discrete=True) + model/num_gaussians: uniform(30, 120, discrete=True) + model/num_interactions: uniform(4, 8, discrete=True) + model/pg_hidden_channels: uniform(1, 2, discrete=True) + model/phys_embeds: choices([True, False]) + model/tag_hidden_channels: uniform(1, 2, discrete=True) + model/complex_mp: choices([True, False]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["concat", False]) + model/cutoff: choices([4.0, 6.0, 8.0]) + model/edge_embed_type: choices([all_rij, all]) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 diff --git a/configs/exps/icml/is2re-all/top-config.yaml b/configs/exps/icml/is2re-all/top-config.yaml new file mode 100644 index 0000000000..0debe73c19 --- /dev/null +++ b/configs/exps/icml/is2re-all/top-config.yaml @@ -0,0 +1,107 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 15:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + wandb_tags: 'best-config' + optim: + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: true + +runs: + - config: fanet-is2re-all + note: 'top-1-FA' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 6 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + - config: fanet-is2re-all + note: 'top-1-FA' + frame_averaging: 2D + fa_frames: all + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 6 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + - config: fanet-is2re-all + note: 'top-1-FA' + frame_averaging: DA + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 6 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + - config: fanet-is2re-all + note: 'top-1-FA' + 
frame_averaging: 3D + fa_frames: random + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 6 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 diff --git a/configs/models/fanet.yaml b/configs/models/fanet.yaml index fe94635ca1..a8eb727333 100644 --- a/configs/models/fanet.yaml +++ b/configs/models/fanet.yaml @@ -79,6 +79,7 @@ is2re: batch_size: 256 eval_batch_size: 256 lr_initial: 0.001 + lr_gamma: 0.1 lr_milestones: # steps at which lr_initial <- lr_initial * lr_gamma - 18000 - 27000 From b4c56af8529d974de2322e06619a826901fcc6c8 Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 17 Jan 2023 06:11:03 -0500 Subject: [PATCH 156/273] skip co with atom concat --- configs/exps/icml/is2re-all/fanet-orion-3.yaml | 1 + ocpmodels/models/fanet.py | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/is2re-all/fanet-orion-3.yaml b/configs/exps/icml/is2re-all/fanet-orion-3.yaml index 1337d47b1e..87f498043f 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-3.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-3.yaml @@ -20,6 +20,7 @@ default: fa_frames: random optim: scheduler: LinearWarmupCosineAnnealingLR + eval_every: 0.5 note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co optim: lr_initial, warmup_steps diff --git a/ocpmodels/models/fanet.py b/ocpmodels/models/fanet.py index 1d2a2a3bc2..28255ddf46 100644 --- a/ocpmodels/models/fanet.py +++ b/ocpmodels/models/fanet.py @@ -496,7 +496,7 @@ class FANet(BaseModel): (default: :obj:`50`) second_layer_MLP (bool): use 2-layers MLP at the end of the Embedding block. skip_co (str): add a skip connection between each interaction block and - energy-head. + energy-head. ("add", False, "concat", "concat_atom") edge_embed_type (str, in {'rij','all_rij','sh', 'all'}): input feature of the edge embedding block. edge_embed_hidden (int): size of edge representation. 
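The `skip_co` options listed in this docstring differ in what gets carried across interaction blocks: "concat" accumulates one pooled energy per block and mixes the scalars, while the new "concat_atom" accumulates the atom embeddings themselves and projects them back to the hidden size before a single pass through the output block. A rough, self-contained sketch of the two aggregations (toy layer names and sizes, not the actual FANet modules):

    import torch
    from torch import nn

    # Illustrative sizes only (assumptions, not taken from the configs above).
    hidden, n_blocks, n_atoms = 64, 4, 10
    h0 = torch.randn(n_atoms, hidden)                    # atom embeddings
    blocks = [nn.Linear(hidden, hidden) for _ in range(n_blocks)]
    out_head = nn.Linear(hidden, 1)                      # stand-in for the output block

    # skip_co == "concat": keep one pooled energy per interaction block,
    # then mix the (n_blocks + 1) energies with a small linear layer.
    h, energies = h0, [out_head(h0).sum(dim=0)]
    for block in blocks:
        h = h + block(h)
        energies.append(out_head(h).sum(dim=0))
    energy_concat = nn.Linear(n_blocks + 1, 1)(torch.cat(energies, dim=-1))

    # skip_co == "concat_atom": keep the atom embeddings themselves, concatenate
    # them feature-wise and project back to `hidden` before one output head call.
    h, states = h0, [h0]
    for block in blocks:
        h = h + block(h)
        states.append(h)
    h = torch.relu(nn.Linear((n_blocks + 1) * hidden, hidden)(torch.cat(states, dim=-1)))
    energy_concat_atom = out_head(h).sum(dim=0)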
@@ -590,6 +590,11 @@ def __init__(self, **kwargs): # Skip co if self.skip_co == "concat": self.mlp_skip_co = Linear((kwargs["num_interactions"] + 1), 1) + elif self.skip_co == "concat_atom": + self.mlp_skip_co = Linear( + ((kwargs["num_interactions"] + 1) * kwargs["hidden_channels"]), + kwargs["hidden_channels"], + ) @conditional_grad(torch.enable_grad()) def forces_forward(self, preds): @@ -651,13 +656,19 @@ def energy_forward(self, data): # Interaction blocks energy_skip_co = [] for interaction in self.interaction_blocks: - if self.skip_co: + if self.skip_co == "concat_atom": + energy_skip_co.append(h) + elif self.skip_co: energy_skip_co.append( self.output_block(h, edge_index, edge_weight, batch, alpha) ) h = h + interaction(h, edge_index, e) - # Output block + # Atom skip-co + if self.skip_co == "concat_atom": + energy_skip_co.append(h) + h = self.act(self.mlp_skip_co(torch.cat(energy_skip_co, dim=1))) + energy = self.output_block(h, edge_index, edge_weight, batch, alpha) # Skip-connection From 48d8e670fcdbfab596d0ae4e8ee49b88ba63b24f Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Tue, 17 Jan 2023 06:13:56 -0500 Subject: [PATCH 157/273] update orion-3 --- configs/exps/icml/is2re-all/fanet-orion-3.yaml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/configs/exps/icml/is2re-all/fanet-orion-3.yaml b/configs/exps/icml/is2re-all/fanet-orion-3.yaml index 87f498043f..8daecd138c 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-3.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-3.yaml @@ -17,7 +17,6 @@ default: model: graph_norm: True frame_averaging: 2D - fa_frames: random optim: scheduler: LinearWarmupCosineAnnealingLR eval_every: 0.5 @@ -36,22 +35,24 @@ orion: unique_exp_name: fanet-is2re-all-v2 space: - optim/max_epochs: fidelity(7, 15, base=6) - optim/lr_initial: loguniform(9e-4, 5e-3, precision=2) - model/hidden_channels: uniform(8, 16, discrete=True) + model/complex_mp: choices([True, False]) + model/cutoff: choices([4.0, 6.0, 8.0]) + model/edge_embed_type: choices(["all_rij", "all"]) model/energy_head: choices(["weighted-av-final-embeds", False]) + model/fa_frames: choices(["random", "se3-random"]) + model/hidden_channels: uniform(8, 16, discrete=True) + model/max_num_neighbors: choices([30, 40, 50]) model/mp_type: choices(["base", "updownscale", "updownscale_base", "updown_local_env"]) model/num_filters: uniform(4, 18, discrete=True) model/num_gaussians: uniform(30, 120, discrete=True) model/num_interactions: uniform(4, 8, discrete=True) model/pg_hidden_channels: uniform(1, 2, discrete=True) model/phys_embeds: choices([True, False]) - model/tag_hidden_channels: uniform(1, 2, discrete=True) - model/complex_mp: choices([True, False]) model/second_layer_MLP: choices([True, False]) model/skip_co: choices(["concat", False]) - model/cutoff: choices([4.0, 6.0, 8.0]) - model/edge_embed_type: choices([all_rij, all]) + model/tag_hidden_channels: uniform(1, 2, discrete=True) + optim/lr_initial: loguniform(9e-4, 5e-3, precision=2) + optim/max_epochs: fidelity(7, 15, base=6) algorithms: asha: seed: 123 From 974fb12e608a0e1568d8688f832145724f734a11 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 10:36:41 -0500 Subject: [PATCH 158/273] more explicit reason --- .../qm7x/{schnet.yaml => schnet-fanet.yaml} | 15 +++++++++++++-- configs/models/tasks/qm7x.yaml | 1 - ocpmodels/modules/scheduler.py | 17 +++++++++++------ 3 files changed, 24 insertions(+), 9 deletions(-) rename configs/exps/qm7x/{schnet.yaml => schnet-fanet.yaml} (79%) diff 
--git a/configs/exps/qm7x/schnet.yaml b/configs/exps/qm7x/schnet-fanet.yaml similarity index 79% rename from configs/exps/qm7x/schnet.yaml rename to configs/exps/qm7x/schnet-fanet.yaml index b5ecf97358..8adab7d58a 100644 --- a/configs/exps/qm7x/schnet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -42,7 +42,7 @@ default: factor: 0.5 threshold: 0.001 threshold_mode: abs - min_lr: 0.00001 + min_lr: 0.000001 verbose: true model: hidden_channels: 256 @@ -69,4 +69,15 @@ runs: - model: num_interactions: 6 num_gaussians: 20 - hidden_channels: 512 \ No newline at end of file + hidden_channels: 512 + - config: fanet-qm7x-all + model: + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + - config: fanet-qm7x-all + model: + force_decoder_type: mlp + edge_embed_type: all_rij + mp_type: updownscale_base + regress_forces: direct_with_gradient_target diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 0a6fa33094..81d9ace719 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -6,7 +6,6 @@ default: otf_graph: False max_num_neighbors: 40 use_pbc: False - force_decoder_type: null task: dataset: qm7x diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index e93fcf78a7..efab892f7e 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -177,16 +177,21 @@ def should_stop(self, metric, lr=None): self.counter += 1 if self.counter >= self.patience: - self.early_stop = True + self.early_stop = "metric" if lr is not None and lr <= self.min_lr: - self.early_stop = True + self.early_stop = "lr" return self.early_stop @property def reason(self): - return ( - f"Early stopping after {self.counter} steps with no improvement:\n" - + " -> ".join([f"{m:.6f}" for m in self.metrics[-self.patience :]]) - ) + if self.early_stop == "metric": + return ( + f"Early stopping after {self.counter} steps with no improvement:\n" + + " -> ".join([f"{m:.6f}" for m in self.metrics[-self.patience :]]) + ) + elif self.early_stop == "lr": + return f"Early stopping because learning rate reached {self.min_lr}" + + return "" From f00c655cc1666b673b8bef86a59bc686f46be088 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:42:48 -0500 Subject: [PATCH 159/273] ES warmup epochs --- ocpmodels/modules/scheduler.py | 7 ++++++- ocpmodels/trainers/base_trainer.py | 15 ++++++++------- ocpmodels/trainers/single_trainer.py | 2 +- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index efab892f7e..7c0a5fc071 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -106,6 +106,7 @@ def __init__( min_abs_change=1e-5, store_all_steps=True, min_lr=-1, + warmup_epochs=-1, ): """ Whether train should stop or not. 
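The `warmup_epochs` argument added here keeps resetting the patience counter until a minimum number of epochs has passed, so early stopping can only trigger after the warmup window; together with the change above, `should_stop` now reports a reason instead of a bare boolean. A minimal standalone sketch of that behaviour (not the repo's EarlyStopper class, just the core logic):

    def should_stop(history, patience=15, warmup_epochs=-1, min_abs_change=1e-5):
        """history: list of (epoch, val_metric) pairs, lower is better."""
        best, counter = float("inf"), 0
        for epoch, metric in history:
            if metric < best - min_abs_change:
                best, counter = metric, 0       # improvement: reset patience
            else:
                counter += 1
            if 0 < warmup_epochs and epoch < warmup_epochs:
                counter = 0                     # no patience accumulated during warmup
            if counter >= patience:
                return f"no improvement for {patience} evaluations (epoch {epoch})"
        return ""

    # A flat validation curve only triggers once the warmup window has passed:
    flat = [(e, 1.0) for e in range(30)]
    assert should_stop(flat, patience=5, warmup_epochs=50) == ""
    assert should_stop(flat, patience=5, warmup_epochs=10) != ""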
@@ -130,6 +131,7 @@ def __init__( self.min_abs_change = min_abs_change self.store_all_steps = store_all_steps self.min_lr = min_lr + self.warmup_epochs = warmup_epochs self.metrics = [] if self.mode == "min": @@ -141,7 +143,7 @@ def __init__( self.early_stop = "" - def should_stop(self, metric, lr=None): + def should_stop(self, metric, lr=None, epoch=None): """ Returns why the training should stop: • Empty string if the training shouldn't stop @@ -176,6 +178,9 @@ def should_stop(self, metric, lr=None): else: self.counter += 1 + if self.warmup_epochs > 0 and epoch is not None and epoch < self.warmup_epochs: + self.counter = 0 + if self.counter >= self.patience: self.early_stop = "metric" diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e22a648d03..a75db63bbe 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -36,7 +36,7 @@ from ocpmodels.common.graph_transforms import RandomReflect, RandomRotate from ocpmodels.common.registry import registry from ocpmodels.common.timer import Times -from ocpmodels.common.utils import JOB_ID, get_commit_hash, save_checkpoint +from ocpmodels.common.utils import JOB_ID, get_commit_hash, save_checkpoint, resolve from ocpmodels.datasets.data_transforms import FrameAveraging, get_transforms from ocpmodels.modules.evaluator import Evaluator from ocpmodels.modules.exponential_moving_average import ( @@ -52,7 +52,7 @@ class BaseTrainer(ABC): def __init__(self, **kwargs): run_dir = kwargs["run_dir"] - model_name = kwargs["model"].pop("name") + model_name = kwargs["model"].pop("name", kwargs["model_name"]) kwargs["model"]["graph_rewiring"] = kwargs.get("graph_rewiring") self.config = { @@ -60,9 +60,9 @@ def __init__(self, **kwargs): "model_name": model_name, "gpus": dist_utils.get_world_size() if not kwargs["cpu"] else 0, "commit": get_commit_hash(), - "checkpoint_dir": str(Path(run_dir) / "checkpoints"), - "results_dir": str(Path(run_dir) / "results"), - "logs_dir": str(Path(run_dir) / "logs"), + "checkpoint_dir": str(resolve(run_dir) / "checkpoints"), + "results_dir": str(resolve(run_dir) / "results"), + "logs_dir": str(resolve(run_dir) / "logs"), } self.sigterm = False @@ -84,9 +84,10 @@ def __init__(self, **kwargs): self.samplers = {} self.loaders = {} self.early_stopper = EarlyStopper( - patience=15, - min_abs_change=1e-5, + patience=self.config["optim"].get("es_patience") or 15, + min_abs_change=self.config["optim"].get("es_min_abs_change") or 1e-5, min_lr=self.config["optim"].get("min_lr", -1), + warmup_epochs=self.config["optim"].get("es_warmup_epochs") or -1, ) if torch.cuda.is_available() and not self.cpu: diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 5a6ed887b1..ef7fc5fff6 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -334,7 +334,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): training_state=False, ) if self.early_stopper.should_stop( - current_val_metric, self.scheduler.get_lr() + current_val_metric, self.scheduler.get_lr(), self.epoch ): print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") if self.logger: From 12c189967987b76e59fd1fa644c00f7a3d527ba7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:43:21 -0500 Subject: [PATCH 160/273] enable `--continue_from_dir` arg --- main.py | 6 -- ocpmodels/common/flags.py | 5 +- ocpmodels/common/logger.py | 2 +- ocpmodels/common/utils.py | 135 +++++++++++++++++++++++++------------ 4 files changed, 
96 insertions(+), 52 deletions(-) diff --git a/main.py b/main.py index b3f02d599b..9a756f817a 100644 --- a/main.py +++ b/main.py @@ -166,12 +166,6 @@ def run(self, orion_exp=None): parser = flags.get_parser() args, override_args = parser.parse_known_args() args = update_from_sbatch_py_vars(args) - if not args.config: - args.config = "sfarinet-is2re-10k" - # args.checkpoint = "checkpoints/2022-04-26-12-23-28-schnet/checkpoint.pt" - warnings.warn( - f"\n>>>> No config is provided. Defaulting to {args.config} chosen\n" - ) if args.logdir: args.logdir = resolve(args.logdir) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 77cb140cc3..f7da16d626 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -86,6 +86,9 @@ def add_core_args(self): self.parser.add_argument( "--checkpoint", type=str, help="Model checkpoint to load" ) + self.parser.add_argument( + "--continue_from_dir", type=str, help="Run to continue, loading its config" + ) self.parser.add_argument( "--timestamp-id", default=None, @@ -109,7 +112,7 @@ def add_core_args(self): ) self.parser.add_argument( "--logdir", - default="$SCRATCH/ocp/runs/$SLURM_JOB_ID", + default=Path("$SCRATCH/ocp/runs/$SLURM_JOB_ID"), type=Path, help="Where to store logs", ) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index 380401332c..b628704fcf 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -133,7 +133,7 @@ def __init__(self, trainer_config): self.url = wandb.run.get_url() if self.url: - with open(Path(self.trainer_config["run_dir"] / "wandb_url.txt"), "w") as f: + with open(Path(self.trainer_config["run_dir"]) / "wandb_url.txt", "w") as f: f.write(self.url + "\n") if not CLUSTER.drac: self.collect_output_files(policy="live") diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index edade03673..68029bf367 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -6,6 +6,7 @@ """ import ast +import argparse import collections import copy import glob @@ -973,6 +974,50 @@ def load_config_legacy(path: str, previous_includes: list = []): return config, duplicates_warning, duplicates_error +def set_cpus_to_workers(config): + if not config.get("no_cpus_to_workers"): + cpus = count_cpus() + gpus = count_gpus() + if cpus is not None: + if gpus == 0: + workers = cpus - 1 + else: + workers = cpus // gpus + if not config["silent"]: + print( + f"Overriding num_workers from {config['optim']['num_workers']}", + f"to {workers} to match the machine's CPUs.", + "Use --no_cpus_to_workers=true to disable this behavior.", + ) + config["optim"]["num_workers"] = workers + return config + + +def check_regress_forces(config): + if "regress_forces" in config["model"]: + if config["model"]["regress_forces"] == "": + config["model"]["regress_forces"] = False + if not isinstance(config["model"]["regress_forces"], str): + if config["model"]["regress_forces"] is False: + config["model"]["regress_forces"] = "" + else: + raise ValueError( + "regress_forces must be False or a string: " + + "'from_energy' or 'direct' or 'direct_with_gradient_target'" + + f". Received: `{str(config['model']['regress_forces'])}`" + ) + elif config["model"]["regress_forces"] not in { + "from_energy", + "direct", + "direct_with_gradient_target", + }: + raise ValueError( + "regress_forces must be False or a string: " + + "'from_energy' or 'direct' or 'direct_with_gradient_target'" + + f". 
Received: `{str(config['model']['regress_forces'])}`" + ) + + def load_config(config_str): model, task, split = config_str.split("-") conf_path = ROOT / "configs" / "models" @@ -1002,74 +1047,76 @@ def load_config(config_str): def build_config(args, args_override): + config = overrides = continue_config = {} if args.config_yml: raise ValueError( "Using LEGACY config format. Please update your config to the new format." ) - config = load_config(args.config) - - # Check for overridden parameters. + args_dict_with_defaults = {k: v for k, v in vars(args).items() if v is not None} if args_override != []: overrides = create_dict_from_args(args_override) - config = merge_dicts(config, overrides) - config = merge_dicts(config, {k: v for k, v in vars(args).items() if v is not None}) + if args.continue_from_dir: + cont_dir = Path(args.continue_from_dir) + best_ckpt = cont_dir / "checkpoints/best_checkpoint.pt" + if not best_ckpt.exists(): + print( + f"💥 Could not find best checkpoint at {str(best_ckpt)}. " + + "Please make sure the directory is correct." + ) + else: + continue_config = torch.load(str(best_ckpt), map_location="cpu")["config"] + continue_config["checkpoint"] = str( + sorted( + cont_dir.glob("checkpoints/checkpoint-*.pt"), + key=lambda c: float(c.stem.split("-")[-1]), + )[-1] + ) + print( + "✅ Loading config from continuing dir and latest checkpoint:", + continue_config["checkpoint"], + ) + args.config = continue_config["config"] + + config = load_config(args.config) + config = merge_dicts(config, args_dict_with_defaults) + config = merge_dicts(config, overrides) config["data_split"] = args.config.split("-")[-1] config["run_dir"] = resolve(config["run_dir"]) config["slurm"] = {} config["job_id"] = JOB_ID or "no-job-id" config["job_ids"] = JOB_ID or "no-job-id" config["cluster_name"] = CLUSTER.name + config["world_size"] = args.num_nodes * args.num_gpus - if "regress_forces" in config["model"]: - if config["model"]["regress_forces"] == "": - config["model"]["regress_forces"] = False - if not isinstance(config["model"]["regress_forces"], str): - if config["model"]["regress_forces"] is False: - config["model"]["regress_forces"] = "" - else: - raise ValueError( - "regress_forces must be False or a string: " - + "'from_energy' or 'direct' or 'direct_with_gradient_target'" - + f". Received: `{str(config['model']['regress_forces'])}`" - ) - elif config["model"]["regress_forces"] not in { - "from_energy", - "direct", - "direct_with_gradient_target", - }: - raise ValueError( - "regress_forces must be False or a string: " - + "'from_energy' or 'direct' or 'direct_with_gradient_target'" - + f". 
Received: `{str(config['model']['regress_forces'])}`" - ) + if continue_config: + dirs_k_v = [(k, v) for k, v in config.items() if "dir" in k] + dataset_config = copy.deepcopy(config["dataset"]) + config = merge_dicts( + continue_config, + {k: resolve(v) if isinstance(v, str) else v for k, v in dirs_k_v}, + ) + config["dataset"] = dataset_config + config = merge_dicts(config, cli_args_dict()) + config = merge_dicts(config, overrides) + check_regress_forces(config) + config = set_cpus_to_workers(config) config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) config = override_drac_paths(config) - if not config["no_cpus_to_workers"]: - cpus = count_cpus() - gpus = count_gpus() - if cpus is not None: - if gpus == 0: - workers = cpus - 1 - else: - workers = cpus // gpus - if not config["silent"]: - print( - f"Overriding num_workers from {config['optim']['num_workers']}", - f"to {workers} to match the machine's CPUs.", - "Use --no_cpus_to_workers=true to disable this behavior.", - ) - config["optim"]["num_workers"] = workers - config["world_size"] = args.num_nodes * args.num_gpus - return config +def cli_args_dict(): + dummy = argparse.ArgumentParser() + _, cli_args = dummy.parse_known_args() + return create_dict_from_args(cli_args) + + def create_grid(base_config, sweep_file): def _flatten_sweeps(sweeps, root_key="", sep="."): flat_sweeps = [] From 4d6f41f1bd7b88a9e5930485123f452d504961d8 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:44:08 -0500 Subject: [PATCH 161/273] use resolve --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 68029bf367..62a9104692 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1059,7 +1059,7 @@ def build_config(args, args_override): overrides = create_dict_from_args(args_override) if args.continue_from_dir: - cont_dir = Path(args.continue_from_dir) + cont_dir = resolve(args.continue_from_dir) best_ckpt = cont_dir / "checkpoints/best_checkpoint.pt" if not best_ckpt.exists(): print( From ec1fa92b5cc89e09d35616c844288bb92c051eae Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:54:38 -0500 Subject: [PATCH 162/273] improve --- ocpmodels/common/utils.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 62a9104692..8aafd6c51e 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1060,24 +1060,19 @@ def build_config(args, args_override): if args.continue_from_dir: cont_dir = resolve(args.continue_from_dir) - best_ckpt = cont_dir / "checkpoints/best_checkpoint.pt" - if not best_ckpt.exists(): + ckpts = list(cont_dir.glob("checkpoints/checkpoint-*.pt")) + if not ckpts: print( - f"💥 Could not find best checkpoint at {str(best_ckpt)}. " + f"💥 Could not find checkpoints in {str(cont_dir)}. " + "Please make sure the directory is correct." 
) else: - continue_config = torch.load(str(best_ckpt), map_location="cpu")["config"] - continue_config["checkpoint"] = str( - sorted( - cont_dir.glob("checkpoints/checkpoint-*.pt"), - key=lambda c: float(c.stem.split("-")[-1]), - )[-1] - ) - print( - "✅ Loading config from continuing dir and latest checkpoint:", - continue_config["checkpoint"], + latest_ckpt = str( + sorted(ckpts, key=lambda c: float(c.stem.split("-")[-1]))[-1] ) + continue_config["checkpoint"] = str(latest_ckpt) + continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] + print("✅ Loading config from cont dir and latest checkpoint:", latest_ckpt) args.config = continue_config["config"] config = load_config(args.config) From 5777bd257be64a467c2f9b4ad3dbd27128523423 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 15:56:15 -0500 Subject: [PATCH 163/273] update --- configs/exps/icml/qm9/fanet-manual.yaml | 12 +++--- configs/exps/qm7x/schnet-fanet.yaml | 57 +++++++++++++++++++------ 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index 0fc5b23986..9a26c42d33 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -25,18 +25,20 @@ default: warmup_steps: 3000 # parameters EMA ema_decay: 0.999 - decay_steps: max_steps - scheduler: LinearWarmupCosineAnnealingLR batch_size: 64 - initial_lr: 0.001 + initial_lr: 0.0005 max_epochs: 1500 loss_energy: mse loss_force: mse + # early stopping + es_patience: 20 + es_es_min_abs_change: 0.000001 + es_warmup_epochs: 500 # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min factor: 0.5 - threshold: 0.001 + threshold: 0.0001 threshold_mode: abs min_lr: 0.00001 verbose: true @@ -70,8 +72,6 @@ runs: cutoff: 6.0 - optim: batch_size: 128 - - optim: - batch_size: 512 - model: energy_head: "" - model: diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index 8adab7d58a..165a492eff 100644 --- a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -1,7 +1,7 @@ # trainset has 4068193 samples job: mem: 32GB - cpus: 8 + cpus: 4 gres: gpu:16gb:1 partition: long code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 @@ -25,15 +25,16 @@ default: model: name, num_gaussians, hidden_channels, num_filters, num_interactions optim: batch_size, lr_initial optim: - batch_size: 512 - warmup_steps: 1000 + batch_size: 32 + max_epochs: 100 + warmup_steps: 3000 lr_initial: 0.0005 eval_every: 0.34 # parameters EMA ema_decay: 0.999 - energy_coefficient: 1 - energy_grad_coefficient: 10 - force_coefficient: 100 + energy_coefficient: 0.001 + energy_grad_coefficient: 0 + force_coefficient: 0.999 loss_energy: mse loss_force: mse # all below is for the scheduler @@ -58,26 +59,56 @@ runs: lr_initial: 0.001 batch_size: 1024 - optim: - batch_size: 256 - - optim: - lr_initial: 0.001 - - optim: - lr_initial: 0.001 - batch_size: 2048 + batch_size: 128 - model: hidden_channels: 512 - model: num_interactions: 6 - num_gaussians: 20 + num_gaussians: 50 hidden_channels: 512 + - config: fanet-qm7x-all + optim: + initial_lr: 0.0005 + energy_coefficient: 0.001 + energy_grad_coefficient: 0.01 + force_coefficient: 0.989 model: + graph_norm: true force_decoder_type: mlp edge_embed_type: all_rij regress_forces: direct_with_gradient_target - config: fanet-qm7x-all + optim: + initial_lr: 0.0001 + energy_coefficient: 0.001 + energy_grad_coefficient: 0.01 + force_coefficient: 0.989 model: 
+ graph_norm: true force_decoder_type: mlp edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + - config: fanet-qm7x-all + optim: + initial_lr: 0.0001 + energy_coefficient: 0.001 + energy_grad_coefficient: 0.01 + force_coefficient: 0.989 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + - config: fanet-qm7x-all + optim: + initial_lr: 0.0001 + energy_coefficient: 0.001 + energy_grad_coefficient: 0.01 + force_coefficient: 0.989 + model: + graph_norm: true + force_decoder_type: mlp mp_type: updownscale_base + edge_embed_type: all_rij regress_forces: direct_with_gradient_target From 9dfcce642c71f5caf557784421b63d74f6e26583 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 16:08:44 -0500 Subject: [PATCH 164/273] update fanet orion qm9 --- configs/exps/icml/qm9/fanet-manual.yaml | 2 +- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 27 +++++++++++++++------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index 9a26c42d33..bf63636e08 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -32,7 +32,7 @@ default: loss_force: mse # early stopping es_patience: 20 - es_es_min_abs_change: 0.000001 + es_min_abs_change: 0.000001 es_warmup_epochs: 500 # all below is for the scheduler scheduler: ReduceLROnPlateau diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 97aa0a69bf..88e871f0a4 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -14,19 +14,29 @@ default: wandb_tags: qm9, orion log_train_every: 100 optim: - warmup_steps: 2000 + warmup_steps: 3000 # parameters EMA ema_decay: 0.999 - decay_steps: max_steps - scheduler: LinearWarmupCosineAnnealingLR - batch_size: 64 + loss_energy: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 500 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.00001 + verbose: true note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm optim: batch_size, lr_initial _root_: frame_averaging, fa_frames orion_mult_factor: value: 32 - targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, batch_size frame_averaging: 3D fa_frames: random model: @@ -36,11 +46,12 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v2.0.0 + unique_exp_name: fanet-qm9-v3.0.0 space: - optim/max_epochs: fidelity(100, 2000, base=5) - optim/lr_initial: loguniform(1e-4, 5e-3, precision=2) + optim/max_epochs: fidelity(150, 2000, base=5) + optim/batch_size: uniform(1, 4, discrete=True) + optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) model/graph_norm: choices([True, False]) model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"]) model/hidden_channels: uniform(5, 16, discrete=True) From b7896c5f515e155d97be6fd98c1b16906d21526b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 17 Jan 2023 16:40:52 -0500 Subject: [PATCH 165/273] fix model name --- 
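A note on the `orion_mult_factor` block used in the search spaces above: Orion samples small integers (e.g. `model/hidden_channels: uniform(8, 16, discrete=True)`) and the listed `targets` are then multiplied by `value`, so the runs actually explore hidden sizes in the 256-512 range in steps of 32 (11 x 32 = 352 matches the top configs earlier in this series). The post-processing is roughly as below; this approximates the repo's `apply_mult_factor` and the exact implementation may differ:

    def apply_mult_factor(hparams, mult_factor, sep="."):
        """Multiply selected Orion-sampled integers by a common factor (sketch).

        hparams     : flat dict such as {"model.hidden_channels": 11, ...}
        mult_factor : {"value": 32, "targets": "hidden_channels, num_filters"}
        """
        targets = {t.strip() for t in mult_factor["targets"].split(",")}
        out = {}
        for key, val in hparams.items():
            leaf = key.split(sep)[-1]
            out[key] = val * mult_factor["value"] if leaf in targets else val
        return out

    sampled = {"model.hidden_channels": 11, "model.num_gaussians": 90, "optim.lr_initial": 2e-3}
    print(apply_mult_factor(sampled, {"value": 32, "targets": "hidden_channels, num_filters"}))
    # {'model.hidden_channels': 352, 'model.num_gaussians': 90, 'optim.lr_initial': 0.002}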
configs/exps/qm7x/schnet-fanet.yaml | 2 +- ocpmodels/trainers/base_trainer.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index 165a492eff..dfa52ea54f 100644 --- a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -35,7 +35,7 @@ default: energy_coefficient: 0.001 energy_grad_coefficient: 0 force_coefficient: 0.999 - loss_energy: mse + loss_energy: mae loss_force: mse # all below is for the scheduler scheduler: ReduceLROnPlateau diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index a75db63bbe..71ebca2585 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -52,7 +52,9 @@ class BaseTrainer(ABC): def __init__(self, **kwargs): run_dir = kwargs["run_dir"] - model_name = kwargs["model"].pop("name", kwargs["model_name"]) + model_name = kwargs["model"].pop( + "name", kwargs.get("model_name", "Unknown - base_trainer issue") + ) kwargs["model"]["graph_rewiring"] = kwargs.get("graph_rewiring") self.config = { From 1d3ca4831d6be405f6960fc494492380eeb01bf5 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 16:24:22 -0500 Subject: [PATCH 166/273] improve exp manager --- configs/exps/icml/qm9/fanet-manual.yaml | 29 +++-- configs/exps/qm7x/schnet-fanet.yaml | 153 ++++++++++++++---------- main.py | 4 - ocpmodels/common/exp_manager.py | 137 ++++++++++++++++++++- 4 files changed, 243 insertions(+), 80 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-manual.yaml b/configs/exps/icml/qm9/fanet-manual.yaml index bf63636e08..79a65f829b 100644 --- a/configs/exps/icml/qm9/fanet-manual.yaml +++ b/configs/exps/icml/qm9/fanet-manual.yaml @@ -13,12 +13,9 @@ default: wandb_tags: qm9, fanet-qm9-v1.0.2-continued log_train_every: 100 note: - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm - optim: batch_size, lr_initial + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size _root_: frame_averaging, fa_frames - orion_mult_factor: - value: 32 - targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels frame_averaging: 3D fa_frames: random optim: @@ -31,17 +28,18 @@ default: loss_energy: mse loss_force: mse # early stopping - es_patience: 20 + es_patience: 50 es_min_abs_change: 0.000001 es_warmup_epochs: 500 - # all below is for the scheduler + # all below is for the ReduceLROnPlateau scheduler scheduler: ReduceLROnPlateau mode: min - factor: 0.5 + factor: 0.75 threshold: 0.0001 threshold_mode: abs - min_lr: 0.00001 + min_lr: 0.000001 verbose: true + patience: 10 model: cutoff: 5.0 edge_embed_type: all_rij @@ -54,7 +52,7 @@ default: num_gaussians: 50 num_interactions: 5 otf_graph: false - pg_hidden_channels: 32 + pg_hidden_channels: 16 phys_embeds: true phys_hidden_channels: 0 second_layer_MLP: false @@ -76,3 +74,14 @@ runs: energy_head: "" - model: energy_head: "weighted-av-initial-embeds" + - model: + num_interactions: 4 + num_gaussians: 20 + hidden_channels: 512 + num_filters: 512 + - model: + mp_type: updownscale_base + num_interactions: 4 + num_gaussians: 20 + hidden_channels: 512 + num_filters: 512 diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index dfa52ea54f..ece614e5f3 100644 --- 
a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -25,16 +25,16 @@ default: model: name, num_gaussians, hidden_channels, num_filters, num_interactions optim: batch_size, lr_initial optim: - batch_size: 32 + batch_size: 10 max_epochs: 100 warmup_steps: 3000 - lr_initial: 0.0005 + lr_initial: 0.0001 eval_every: 0.34 # parameters EMA ema_decay: 0.999 - energy_coefficient: 0.001 + energy_coefficient: 0.01 energy_grad_coefficient: 0 - force_coefficient: 0.999 + force_coefficient: 0.99 loss_energy: mae loss_force: mse # all below is for the scheduler @@ -46,69 +46,102 @@ default: min_lr: 0.000001 verbose: true model: - hidden_channels: 256 + hidden_channels: 128 num_filters: 128 - num_gaussians: 100 + num_gaussians: 20 num_interactions: 6 cutoff: 5.0 - regress_forces: "from_energy" + regress_forces: from_energy runs: - - {} - - optim: - lr_initial: 0.001 - batch_size: 1024 - - optim: - batch_size: 128 - - model: - hidden_channels: 512 - - model: - num_interactions: 6 - num_gaussians: 50 - hidden_channels: 512 + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True - - config: fanet-qm7x-all + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True optim: - initial_lr: 0.0005 - energy_coefficient: 0.001 - energy_grad_coefficient: 0.01 - force_coefficient: 0.989 - model: - graph_norm: true - force_decoder_type: mlp - edge_embed_type: all_rij - regress_forces: direct_with_gradient_target - - config: fanet-qm7x-all + lr_initial: 0.001 + batch_size: 100 + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True optim: - initial_lr: 0.0001 - energy_coefficient: 0.001 - energy_grad_coefficient: 0.01 - force_coefficient: 0.989 - model: - graph_norm: true - force_decoder_type: mlp - edge_embed_type: all_rij - regress_forces: direct_with_gradient_target - - config: fanet-qm7x-all + lr_initial: 0.001 + batch_size: 256 + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: False + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: False optim: - initial_lr: 0.0001 - energy_coefficient: 0.001 - energy_grad_coefficient: 0.01 - force_coefficient: 0.989 - model: - graph_norm: false - force_decoder_type: mlp - edge_embed_type: all_rij - regress_forces: direct_with_gradient_target - - config: fanet-qm7x-all + lr_initial: 0.001 + batch_size: 100 + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: False optim: - initial_lr: 0.0001 - energy_coefficient: 0.001 - energy_grad_coefficient: 0.01 - force_coefficient: 0.989 - model: - graph_norm: true - force_decoder_type: mlp - mp_type: updownscale_base - edge_embed_type: all_rij - regress_forces: direct_with_gradient_target + lr_initial: 0.001 + batch_size: 256 + + # - config: fanet-qm7x-all + # model: + # graph_norm: true + # edge_embed_type: all_rij + # mp_type: updownscale_base + + # - config: fanet-qm7x-all + # optim: + # energy_coefficient: 0.01 + # energy_grad_coefficient: 0.1 + # force_coefficient: 0.89 + # lr_initial: 0.001 + # batch_size: 100 + # model: + # graph_norm: true + # edge_embed_type: all_rij + # mp_type: updownscale_base + # force_decoder_type: mlp + # regress_forces: direct_with_gradient_target + + # - config: fanet-qm7x-all + # optim: + # energy_coefficient: 0.01 + # energy_grad_coefficient: 0.1 + # force_coefficient: 0.89 + # lr_initial: 0.001 + # batch_size: 100 + # model: + # graph_norm: false + # force_decoder_type: mlp + # edge_embed_type: all_rij + # regress_forces: direct_with_gradient_target + # num_interactions: 4 
+ + # - config: fanet-qm7x-all + # optim: + # energy_coefficient: 0.01 + # energy_grad_coefficient: 0.1 + # force_coefficient: 0.89 + # lr_initial: 0.001 + # batch_size: 100 + # model: + # graph_norm: true + # force_decoder_type: mlp + # mp_type: updownscale_base + # edge_embed_type: all_rij + # regress_forces: direct_with_gradient_target + # num_interactions: 3 + # num_filters: 256 + # hidden_channels: 256 diff --git a/main.py b/main.py index 9a756f817a..2f1d88f939 100644 --- a/main.py +++ b/main.py @@ -112,10 +112,6 @@ def run(self, orion_exp=None): self.hparams, orion_race_condition = dist_utils.broadcast_from_master( self.hparams, orion_race_condition ) - if orion_race_condition: - if dist_utils.is_master(): - shutil.rmtree(self.trainer_config["run_dir"]) - return if self.hparams: print("\n💎 Received hyper-parameters from Orion:") print(dump(self.hparams), end="\n") diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index f59a540919..a89574096f 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -8,8 +8,13 @@ import sys import time from datetime import datetime +import yaml +from tqdm import tqdm -rundir = Path(os.environ["SCRATCH"]) / "ocp" / "runs" +RUN_DIR = Path(os.environ["SCRATCH"]) / "ocp" / "runs" +ROOT = Path(__file__).resolve().parent.parent.parent +EXP_OUT_DIR = ROOT / "data" / "exp_outputs" +MANAGER_CACHE = ROOT / "data" / "exp_manager_cache" class Manager: @@ -18,9 +23,13 @@ def __init__( orion_db_path="", name="", wandb_path="mila-ocp/ocp-qm", + rebuild_cache=False, + print_tracebacks=True, ): self.api = wandb.Api() self.wandb_path = wandb_path + self.rebuild_cache = rebuild_cache + self.print_tracebacks = print_tracebacks self.wandb_runs = [ r for r in self.api.runs(wandb_path) @@ -28,6 +37,12 @@ def __init__( and name in r.config.get("orion_exp_config_path", "") ] self.name = name + self.cache_path = MANAGER_CACHE / f"{self.name}.yaml" + self.cache = ( + yaml.safe_load(self.cache_path.read_text()) + if self.cache_path.exists() + else {} + ) self.trial_hparams_to_rundirs = defaultdict(list) self.exp = get_experiment( name=name, @@ -63,7 +78,12 @@ def __init__( [p.name for runs in self.trial_hparams_to_rundirs.values() for p in runs] ) print("\n") + self.discover_yamls() + self.discover_job_ids_from_yaml() + self.parse_output_files() self.print_status() + print("\n") + self.print_output_files_stats() def print_status(self): print("{:32} : {:4} ".format("Trials in experiment", len(self.trials))) @@ -120,7 +140,7 @@ def print_status(self): ) running = set(self.job_ids) & sq waiting = ( - set([j.parent.name for j in rundir.glob(f"*/{self.name}.exp")]) & sq + set([j.parent.name for j in RUN_DIR.glob(f"*/{self.name}.exp")]) & sq ) - running print( "{:32} : {}".format( @@ -136,7 +156,7 @@ def print_status(self): ) def discover_run_dirs(self): - for unique in rundir.glob(f"*/{self.name}--*.unique"): + for unique in RUN_DIR.glob(f"*/{self.name}--*.unique"): self.trial_hparams_to_rundirs[unique.stem.split("--")[-1]].append( unique.parent ) @@ -167,6 +187,100 @@ def get_reserved_wandb_runs(self): def print_wandb_query(self): print(f"{'WandB runs query:':32}\n" + "(" + "|".join(self.job_ids) + ")") + def parse_output_files(self): + if "job_state" not in self.cache: + self.cache["job_state"] = {} + for j in tqdm(self.cache["all_job_ids"], desc="Parsing output files"): + if j in self.cache["job_state"] and not self.rebuild_cache: + continue + out_file = RUN_DIR / j / "output-0.txt" + + if not out_file.exists(): + 
self.cache["job_state"][j] = "No output file (RaceCondition)" + continue + + out_txt = out_file.read_text() + if "RaceCondition" in out_txt: + self.cache["job_state"][j] = "RaceCondition" + elif "Traceback" in out_txt: + self.cache["job_state"][j] = ( + "Traceback: " + out_txt.split("Traceback")[1] + ) + elif "srun: Job step aborted" in out_txt: + if "slurmstepd" in out_txt and " CANCELLED AT " in out_txt: + self.cache["job_state"][j] = "Cancelled" + elif "eval_all_splits" in out_txt and "Final results" in out_txt: + self.cache["job_state"][j] = "Finished" + elif "nan_loss" in out_txt: + self.cache["job_state"][j] = "NaN loss" + else: + self.cache["job_state"][j] = "Unknown" + self.commit_cache() + + def print_output_files_stats(self): + print("Job status from output files:\n" + "-" * 29 + "\n") + stats = {} + for j, o in self.cache["job_state"].items(): + if "Traceback" in o: + if "Traceback" not in stats: + stats["Traceback"] = {"n": 0, "ids": [], "contents": []} + stats["Traceback"]["n"] += 1 + stats["Traceback"]["ids"].append(j) + stats["Traceback"]["contents"].append(o) + else: + if o not in stats: + stats[o] = {"n": 0, "ids": []} + stats[o]["n"] += 1 + stats[o]["ids"].append(j) + for s, v in stats.items(): + print(f"• {s:31}" + f": {v['n']} (" + " ".join(v["ids"]) + ")") + if stats["Traceback"]["n"] > 0 and self.print_tracebacks: + print("\nTraceback contents:\n" + "-" * 19 + "\n") + print( + f"\n\n{'|' * 50}\n{'|' * 50}\n{'|' * 50}\n".join( + f"{j}:\n{o}" + for j, o in zip( + stats["Traceback"]["ids"], stats["Traceback"]["contents"] + ) + ) + ) + + def discover_job_ids_from_yaml(self): + all_jobs = ( + set(self.cache.get("all_job_ids", [])) if not self.rebuild_cache else set() + ) + for yaml_path in self.cache["exp_yamls"]: + lines = Path(yaml_path).read_text().splitlines() + jobs_line = [line for line in lines if "All jobs launched" in line][0] + jobs = [ + j.strip() + for j in jobs_line.split("All jobs launched: ")[-1].strip().split(", ") + ] + all_jobs |= set(jobs) + self.cache["all_job_ids"] = sorted(all_jobs) + self.commit_cache() + + def discover_yamls(self): + yamls = set() + if self.cache and not self.rebuild_cache: + cache_yamls = self.cache.get("exp_yamls") or [] + yamls |= set(cache_yamls) + for yaml_conf in EXP_OUT_DIR.glob("**/*.yaml"): + if str(yaml_conf) not in yamls: + yaml_txt = yaml_conf.read_text() + if self.name in yaml_txt: + y = yaml.safe_load(yaml_txt) + if y.get("orion", {}).get("unique_exp_name") == self.name: + yamls.add(str(yaml_conf)) + yamls = sorted(yamls) + self.cache["exp_yamls"] = yamls + self.commit_cache() + + def commit_cache(self): + if not self.cache_path.parent.exists(): + self.cache_path.parent.mkdir(parents=True) + self.cache_path.write_text(yaml.safe_dump(self.cache)) + @classmethod def help(self): return dedent( @@ -175,9 +289,11 @@ def help(self): Manager init() -------------- - orion_db_path -> (str or pathlib.Path) pointing to the orion db pickle file - name -> (str) unique orion experiment name in the db - wandb_path -> (str) path to the wandb project like "{entity}/{project}" + orion_db_path -> (str or pathlib.Path) pointing to the orion db pickle file + name -> (str) unique orion experiment name in the db + wandb_path -> (str) path to the wandb project like "{entity}/{project}" + rebuild_cache -> (bool, default: False) if True, will rebuild the output file cache from scratch + print_tracebacks -> (bool, default: False) if True, will print the Traceback contents in the output files ---------- Attributes @@ -220,6 +336,8 @@ def 
help(self): "name": None, "wandb_path": None, "watch": -1, + "rebuild_cache": False, + "print_tracebacks": False, } args = resolved_args(defaults=defaults) if args.help: @@ -230,6 +348,11 @@ def help(self): "In [1]: run ocpmodels/common/exp_manager.py", "name='ocp-qm9-orion-debug-v1.0.0' wandb_path='mila-ocp/ocp-3'", ) + print( + "In [1]: run ocpmodels/common/exp_manager.py", + "name='ocp-qm9-orion-debug-v1.0.0' wandb_path='mila-ocp/ocp-3'", + "print_tracebacks", + ) print("\n\n🧞 Manager help:") print(Manager.help()) sys.exit(0) @@ -257,6 +380,8 @@ def help(self): name=args.name, wandb_path=args.wandb_path, orion_db_path=orion_db_path, + rebuild_cache=args.rebuild_cache, + print_tracebacks=args.print_tracebacks, ) # m.print_wandb_query() From a7a0dee8514854263d77c5c05119c91f71643ce7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 17:24:09 -0500 Subject: [PATCH 167/273] use `get_and_move_orion_db_path` --- ocpmodels/common/exp_manager.py | 8 ++------ ocpmodels/common/utils.py | 32 +++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index a89574096f..b90e473742 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -10,9 +10,8 @@ from datetime import datetime import yaml from tqdm import tqdm +from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path -RUN_DIR = Path(os.environ["SCRATCH"]) / "ocp" / "runs" -ROOT = Path(__file__).resolve().parent.parent.parent EXP_OUT_DIR = ROOT / "data" / "exp_outputs" MANAGER_CACHE = ROOT / "data" / "exp_manager_cache" @@ -372,10 +371,7 @@ def help(self): "💃 Status of experiment", f"'{args.name}' and wandb entity/project '{args.wandb_path}':", ) - orion_db_path = str( - Path(__file__).resolve().parent.parent.parent - / f"data/orion/storage/{args.name}_db.pkl" - ) + orion_db_path = get_and_move_orion_db_path(args.name) m = Manager( name=args.name, wandb_path=args.wandb_path, diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 8aafd6c51e..8c89c73a22 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -5,8 +5,8 @@ LICENSE file in the root directory of this source tree. 
""" -import ast import argparse +import ast import collections import copy import glob @@ -24,6 +24,7 @@ from functools import wraps from itertools import product from pathlib import Path +from shutil import copyfile import numpy as np import torch @@ -66,6 +67,7 @@ def __getattr__(self, k: str): OCP_TASKS = {"s2ef", "is2re", "is2es"} ROOT = Path(__file__).resolve().parent.parent.parent JOB_ID = os.environ.get("SLURM_JOB_ID") +RUN_DIR = Path(os.environ["SCRATCH"]) / "ocp" / "runs" def set_max_fidelity(hparams, orion_exp): @@ -137,6 +139,31 @@ def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): return updated_hparams +def get_and_move_orion_db_path(exp_name): + db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) + db_file = f"{db_id}_db.pkl" if not db_id.endswith("_db.pkl") else db_id + scratch_db = RUN_DIR.parent / "orion" / "storage" / db_file + scratch_db.parent.mkdir(parents=True, exist_ok=True) + if not scratch_db.exists(): + home_db = ROOT / f"data/orion/storage/{db_file}" + + if not home_db.exists(): + return scratch_db + + lock_file = home_db.parent / f"{db_file}.lock" + if not lock_file.exists(): + lock_file.touch() + copyfile(home_db, scratch_db) + print("Copied db from home to scratch.") + lock_file.unlink() + + while lock_file.exists(): + print("Waiting for lock to be released...") + time.sleep(1) + + return scratch_db + + def load_orion_exp(args): exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) @@ -147,8 +174,7 @@ def load_orion_exp(args): print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") exp_name = args.orion_unique_exp_name or exp_config["unique_exp_name"] db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) - db_path = ROOT / "data" / "orion" / "storage" / f"{db_id}_db.pkl" - db_path.parent.mkdir(parents=True, exist_ok=True) + db_path = get_and_move_orion_db_path(db_id) experiment = build_experiment( storage={ "database": { From bf09c2c223fe832b8e01006e392921485049090d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 17:40:49 -0500 Subject: [PATCH 168/273] cp lock --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 8c89c73a22..a62fa3f903 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -150,7 +150,7 @@ def get_and_move_orion_db_path(exp_name): if not home_db.exists(): return scratch_db - lock_file = home_db.parent / f"{db_file}.lock" + lock_file = home_db.parent / f"{db_file}.cp_lock" if not lock_file.exists(): lock_file.touch() copyfile(home_db, scratch_db) From f687fea87311430fd1322650fc221c6884802ee1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 17:55:44 -0500 Subject: [PATCH 169/273] handle symlink --- ocpmodels/common/utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index a62fa3f903..4c61943cbb 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -24,7 +24,7 @@ from functools import wraps from itertools import product from pathlib import Path -from shutil import copyfile +from shutil import copyfile, move import numpy as np import torch @@ -154,7 +154,9 @@ def get_and_move_orion_db_path(exp_name): if not lock_file.exists(): lock_file.touch() copyfile(home_db, scratch_db) - print("Copied db from home to scratch.") + move(home_db, home_db.parent / f"{db_file}.bak") + os.symlink(str(scratch_db), str(home_db)) + 
print("Copied and symlinked db from home to scratch.") lock_file.unlink() while lock_file.exists(): From c24b7ee0eaed82d7fcddeb008f056a9acfde8db7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:06:50 -0500 Subject: [PATCH 170/273] add `rescale_with_hof` option --- .../models/qm7x-metadata/hof_rescales.json | 1 + configs/models/tasks/qm7x.yaml | 1 + main.py | 10 ---- ocpmodels/common/utils.py | 44 +++++++++++----- ocpmodels/datasets/qm7x.py | 29 ++++++++--- ocpmodels/modules/normalizer.py | 32 ++++++++++-- ocpmodels/trainers/base_trainer.py | 4 ++ ocpmodels/trainers/single_trainer.py | 29 +++++++++-- scripts/compute_qm7x_rescales.py | 51 +++++++++++++++++++ 9 files changed, 163 insertions(+), 38 deletions(-) create mode 100644 configs/models/qm7x-metadata/hof_rescales.json create mode 100644 scripts/compute_qm7x_rescales.py diff --git a/configs/models/qm7x-metadata/hof_rescales.json b/configs/models/qm7x-metadata/hof_rescales.json new file mode 100644 index 0000000000..2a0d05d0ee --- /dev/null +++ b/configs/models/qm7x-metadata/hof_rescales.json @@ -0,0 +1 @@ +{"mean": -1.373329520225525, "std": 0.3661123216152191, "about": "Statistics for y(=ePBE0+MBD) / sum(HOF) where HOF is the heat of formation of each element in the graph. This is computed over the train set only."} \ No newline at end of file diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 81d9ace719..c832e2c82f 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -36,6 +36,7 @@ default: target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 + rescale_with_hof: True val_id: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by utils.py if this is True diff --git a/main.py b/main.py index 2f1d88f939..dc763ce3de 100644 --- a/main.py +++ b/main.py @@ -43,16 +43,6 @@ # os.environ["CUDA_LAUNCH_BLOCKING"] = "1" torch.multiprocessing.set_sharing_strategy("file_system") -try: - import ipdb # noqa: F401 - - os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" -except: # noqa: E722 - print( - "`ipdb` is not installed. ", - "Consider `pip install ipdb` to improve your debugging experience.", - ) - def print_warnings(): warnings = [ diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 4c61943cbb..b7e337843f 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -515,6 +515,13 @@ def set_qm7x_target_stats(trainer_config): (ROOT / "configs" / "models" / "qm7x-metadata" / "stats.json").read_text() ) + hof_stats = json.loads( + ( + ROOT / "configs" / "models" / "qm7x-metadata" / "hof_rescales.json" + ).read_text() + ) + hof_stats.pop("about", None) + for d, dataset in deepcopy(trainer_config["dataset"]).items(): if d == "default_val": continue @@ -534,6 +541,11 @@ def set_qm7x_target_stats(trainer_config): trainer_config["dataset"][d]["grad_target_mean"] = mean trainer_config["dataset"][d]["grad_target_std"] = std / std_divider + if "train" in trainer_config["dataset"] and trainer_config["dataset"]["train"].get( + "rescale_with_hof" + ): + trainer_config["dataset"]["train"]["hof_rescales"] = hof_stats + return trainer_config @@ -845,6 +857,16 @@ def add_edge_distance_to_graph( def setup_imports(): from ocpmodels.common.registry import registry + try: + import ipdb # noqa: F401 + + os.environ["PYTHONBREAKPOINT"] = "ipdb.set_trace" + except: # noqa: E722 + print( + "`ipdb` is not installed. 
", + "Consider `pip install ipdb` to improve your debugging experience.", + ) + # First, check if imports are already setup has_already_setup = registry.get("imports_setup", no_warning=True) if has_already_setup: @@ -1651,22 +1673,18 @@ def get_commit_hash(): def base_config(config, overrides={}): - from argparse import Namespace + from ocpmodels.common.flags import flags - n = Namespace() - n.num_gpus = 1 - n.num_nodes = 1 - n.config_yml = None - n.config = config + setup_imports() conf = build_config( - n, - [ - "run_dir=.", - "no_qm7x_cp=true", - "no_cpus_to_workers=true", - "silent=", - ], + *flags.get_parser().parse_known_args( + [ + f"--config={config}", + "--logger=dummy", + ] + ) ) + conf["cpu"] = not torch.cuda.is_available() return merge_dicts(conf, overrides) diff --git a/ocpmodels/datasets/qm7x.py b/ocpmodels/datasets/qm7x.py index 6ecf76ded9..f98774a60c 100644 --- a/ocpmodels/datasets/qm7x.py +++ b/ocpmodels/datasets/qm7x.py @@ -1,24 +1,28 @@ -import time -from torch.utils.data import Dataset +import pickle import random import re +import time from abc import abstractmethod from collections import defaultdict from collections.abc import Iterable from pathlib import Path -import pickle + import h5py +import lmdb import numpy as np import torch +from mendeleev.fetch import fetch_table from rdkit import Chem from rdkit.Chem import AllChem from scipy import spatial as sp from torch import as_tensor +from torch.utils.data import Dataset from torch_geometric.data import Data -from cosmosis.dataset import CDataset from tqdm import tqdm -import lmdb + +from cosmosis.dataset import CDataset from ocpmodels.common.registry import registry +from ocpmodels.common.utils import ROOT try: import orjson as json # noqa: F401 @@ -754,6 +758,10 @@ def __init__( for i in all_samples["splits"][split] ] + self.hofs = fetch_table("elements")["heat_of_formation"].values + self.hofs[np.isnan(self.hofs)] = self.hofs[~np.isnan(self.hofs)].mean() + self.hofs = torch.from_numpy(self.hofs).float() + self.transform = transform def __len__(self): @@ -785,6 +793,9 @@ def __getitem__(self, i): data.natoms = len(data.pos) data.tags = torch.full((data.natoms,), -1, dtype=torch.long) data.atomic_numbers = torch.tensor(data.atNUM, dtype=torch.long) + data.hofs = self.hofs[ + data.atomic_numbers.numpy().astype(int) - 1 # element 1 is at row 0 + ].sum() t1 = time.time_ns() if self.transform is not None: @@ -809,12 +820,14 @@ def close_db(self): if __name__ == "__main__": - from ocpmodels.datasets.qm7x import QM7XFromLMDB as QM7X + import json from pathlib import Path - from tqdm import tqdm + import numpy as np - import json + from tqdm import tqdm + from ocpmodels.common.data_parallel import ParallelCollater + from ocpmodels.datasets.qm7x import QM7XFromLMDB as QM7X src = Path("/network/projects/ocp/qm7x/processed") smp = Path("configs/models/qm7x-metadata/samples.json") diff --git a/ocpmodels/modules/normalizer.py b/ocpmodels/modules/normalizer.py index 302f0d6cef..bbe169eee9 100644 --- a/ocpmodels/modules/normalizer.py +++ b/ocpmodels/modules/normalizer.py @@ -19,6 +19,8 @@ def __init__(self, tensor=None, mean=None, std=None, device=None): if device is None: device = "cpu" + self.device = device + if tensor is not None: self.mean = torch.mean(tensor, dim=0).to(device) self.std = torch.std(tensor, dim=0).to(device) @@ -28,19 +30,43 @@ def __init__(self, tensor=None, mean=None, std=None, device=None): self.mean = torch.tensor(mean).to(device) self.std = torch.tensor(std).to(device) + self.hof_mean = None + 
self.hof_std = None + def to(self, device): self.mean = self.mean.to(device) self.std = self.std.to(device) + if self.hof_mean: + self.hof_mean = self.hof_mean.to(device) + if self.hof_std: + self.hof_std = self.hof_std.to(device) + self.device = device - def norm(self, tensor): + def norm(self, tensor, hofs=None): + if hofs is not None: + return tensor / hofs - self.hof_mean return (tensor - self.mean) / self.std - def denorm(self, normed_tensor): + def denorm(self, normed_tensor, hofs=None): + if hofs is not None: + return (normed_tensor + self.hof_mean) * hofs return normed_tensor * self.std + self.mean def state_dict(self): - return {"mean": self.mean, "std": self.std} + sd = {"mean": self.mean, "std": self.std} + if self.hof_rescales: + sd["hof_rescales"] = { + "mean": self.hof_mean, + "std": self.hof_std, + } + return sd def load_state_dict(self, state_dict): self.mean = state_dict["mean"].to(self.mean.device) self.std = state_dict["std"].to(self.mean.device) + if "hof_rescales" in state_dict: + self.set_hof_rescales(state_dict["hof_rescales"]) + + def set_hof_rescales(self, hof_rescales): + self.hof_mean = torch.tensor(hof_rescales["mean"], device=self.device) + self.hof_std = torch.tensor(hof_rescales["std"], device=self.device) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 71ebca2585..507d8eefc9 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -320,6 +320,10 @@ def load_datasets(self): std=self.normalizer["target_std"], device=self.device, ) + if "hof_rescales" in self.normalizer: + self.normalizers["target"].set_hof_rescales( + self.normalizer["hof_rescales"] + ) else: self.normalizers["target"] = Normalizer( tensor=self.datasets["train"].data.y[ diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index ef7fc5fff6..74c8217681 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -134,7 +134,14 @@ def predict(self, loader, per_image=True, results_file=None, disable_tqdm=False) preds = self.model_forward(batch_list) if self.normalizers is not None and "target" in self.normalizers: - preds["energy"] = self.normalizers["target"].denorm(preds["energy"]) + hofs = None + if self.task_name == "qm7x": + hofs = torch.cat( + [batch.hofs.to(self.device) for batch in batch_list], dim=0 + ) + preds["energy"] = self.normalizers["target"].denorm( + preds["energy"], hofs=hofs + ) if self.normalizers is not None and "grad_target" in self.normalizers: self.normalizers["grad_target"].to(self.device) @@ -467,7 +474,12 @@ def compute_loss(self, preds, batch_list): ) if self.normalizer.get("normalize_labels", False): - target_normed = self.normalizers["target"].norm(energy_target) + hofs = None + if self.task_name == "qm7x": + hofs = torch.cat( + [batch.hofs.to(self.device) for batch in batch_list], dim=0 + ) + target_normed = self.normalizers["target"].norm(energy_target, hofs=hofs) else: target_normed = energy_target energy_mult = self.config["optim"].get("energy_coefficient", 1) @@ -609,10 +621,19 @@ def compute_metrics( ) if self.normalizer.get("normalize_labels") and "target" in self.normalizers: + hofs = None + if self.task_name == "qm7x": + hofs = torch.cat( + [batch.hofs.to(self.device) for batch in batch_list], dim=0 + ) if not self.config.get("no_metrics_denorm"): - preds["energy"] = self.normalizers["target"].denorm(preds["energy"]) + preds["energy"] = self.normalizers["target"].denorm( + preds["energy"], hofs=hofs + ) else: - 
target["energy"] = self.normalizers["target"].norm(target["energy"]) + target["energy"] = self.normalizers["target"].norm( + target["energy"], hofs=hofs + ) metrics = evaluator.eval(preds, target, prev_metrics=metrics) diff --git a/scripts/compute_qm7x_rescales.py b/scripts/compute_qm7x_rescales.py new file mode 100644 index 0000000000..b28c7a4c15 --- /dev/null +++ b/scripts/compute_qm7x_rescales.py @@ -0,0 +1,51 @@ +import json +import os +from pathlib import Path + +import numpy as np +from mendeleev.fetch import fetch_table +from tqdm import tqdm + +os.path.append(Path(__file__).resolve().parent.parent) + +from ocpmodels.common.utils import ( + ROOT, + base_config, + move_lmdb_data_to_slurm_tmpdir, +) +from ocpmodels.trainers.single_trainer import SingleTrainer + +if __name__ == "__main__": + config = base_config("schnet-qm7x-all") + config["cp_data_to_tmpdir"] = True + config = move_lmdb_data_to_slurm_tmpdir(config) + trainer = SingleTrainer(**config) + + df = fetch_table("elements") + HOF = df.set_index("atomic_number")["heat_of_formation"].values + non_nan_hof_mean = HOF[~np.isnan(HOF)].mean() + print("non_nan_hof_mean: ", non_nan_hof_mean) # 353.3106853932584 + HOF[np.isnan(HOF)] = non_nan_hof_mean + + hofs = [] + + for batch_list in tqdm(trainer.loaders["train"]): + hofs += [ + y / HOF[z.astype(int) - 1].sum() + for y, z in zip(batch_list[0].y, batch_list[0].atNUM) + ] + + mean = np.mean(hofs) + std = np.std(hofs) + + (ROOT / "configs" / "models" / "qm7x-metadata" / "hof_rescales.json").write_text( + json.dumps( + { + "mean": float(mean), + "std": float(std), + "about": "Statistics for y(=ePBE0+MBD) / sum(HOF) " + + "where HOF is the heat of formation of each element in the graph." + + " This is computed over the train set only.", + } + ) + ) From b0deb50792570807a01a1f742901dbfd76e34d30 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:17:44 -0500 Subject: [PATCH 171/273] fix path --- configs/exps/qm7x/schnet-fanet.yaml | 97 +++++++++++++++-------------- ocpmodels/common/exp_manager.py | 3 + 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index ece614e5f3..10c22056b1 100644 --- a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -52,6 +52,9 @@ default: num_interactions: 6 cutoff: 5.0 regress_forces: from_energy + dataset: + train: + rescale_with_hof: True runs: - config: schnet-qm7x-all @@ -95,53 +98,53 @@ runs: lr_initial: 0.001 batch_size: 256 - # - config: fanet-qm7x-all - # model: - # graph_norm: true - # edge_embed_type: all_rij - # mp_type: updownscale_base + - config: fanet-qm7x-all + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base - # - config: fanet-qm7x-all - # optim: - # energy_coefficient: 0.01 - # energy_grad_coefficient: 0.1 - # force_coefficient: 0.89 - # lr_initial: 0.001 - # batch_size: 100 - # model: - # graph_norm: true - # edge_embed_type: all_rij - # mp_type: updownscale_base - # force_decoder_type: mlp - # regress_forces: direct_with_gradient_target + - config: fanet-qm7x-all + optim: + energy_coefficient: 0.01 + energy_grad_coefficient: 0.1 + force_coefficient: 0.89 + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct_with_gradient_target - # - config: fanet-qm7x-all - # optim: - # energy_coefficient: 0.01 - # energy_grad_coefficient: 0.1 - # force_coefficient: 0.89 - 
# lr_initial: 0.001 - # batch_size: 100 - # model: - # graph_norm: false - # force_decoder_type: mlp - # edge_embed_type: all_rij - # regress_forces: direct_with_gradient_target - # num_interactions: 4 + - config: fanet-qm7x-all + optim: + energy_coefficient: 0.01 + energy_grad_coefficient: 0.1 + force_coefficient: 0.89 + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + num_interactions: 4 - # - config: fanet-qm7x-all - # optim: - # energy_coefficient: 0.01 - # energy_grad_coefficient: 0.1 - # force_coefficient: 0.89 - # lr_initial: 0.001 - # batch_size: 100 - # model: - # graph_norm: true - # force_decoder_type: mlp - # mp_type: updownscale_base - # edge_embed_type: all_rij - # regress_forces: direct_with_gradient_target - # num_interactions: 3 - # num_filters: 256 - # hidden_channels: 256 + - config: fanet-qm7x-all + optim: + energy_coefficient: 0.01 + energy_grad_coefficient: 0.1 + force_coefficient: 0.89 + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: true + force_decoder_type: mlp + mp_type: updownscale_base + edge_embed_type: all_rij + regress_forces: direct_with_gradient_target + num_interactions: 3 + num_filters: 256 + hidden_channels: 256 diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index b90e473742..37370c8c69 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -10,6 +10,9 @@ from datetime import datetime import yaml from tqdm import tqdm + +os.path.append(Path(__file__).resolve().parent.parent.parent) + from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path EXP_OUT_DIR = ROOT / "data" / "exp_outputs" From 3ce82f4767fc1ca5f830bc137215e5a9e8f5f295 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:18:47 -0500 Subject: [PATCH 172/273] sys not os --- ocpmodels/common/exp_manager.py | 2 +- scripts/compute_qm7x_rescales.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 37370c8c69..4ce5f560bc 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -11,7 +11,7 @@ import yaml from tqdm import tqdm -os.path.append(Path(__file__).resolve().parent.parent.parent) +sys.path.append(Path(__file__).resolve().parent.parent.parent) from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path diff --git a/scripts/compute_qm7x_rescales.py b/scripts/compute_qm7x_rescales.py index b28c7a4c15..7713d360f9 100644 --- a/scripts/compute_qm7x_rescales.py +++ b/scripts/compute_qm7x_rescales.py @@ -1,12 +1,12 @@ import json -import os +import sys from pathlib import Path import numpy as np from mendeleev.fetch import fetch_table from tqdm import tqdm -os.path.append(Path(__file__).resolve().parent.parent) +sys.path.append(Path(__file__).resolve().parent.parent) from ocpmodels.common.utils import ( ROOT, From f266f8b825b20b7ce7d57b307534f232da01fa6e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:20:53 -0500 Subject: [PATCH 173/273] str paths --- ocpmodels/common/exp_manager.py | 2 +- scripts/compute_qm7x_rescales.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 4ce5f560bc..a67a48f24f 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -11,7 +11,7 @@ import yaml from tqdm import tqdm 
-sys.path.append(Path(__file__).resolve().parent.parent.parent) +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path diff --git a/scripts/compute_qm7x_rescales.py b/scripts/compute_qm7x_rescales.py index 7713d360f9..7e91bd53ae 100644 --- a/scripts/compute_qm7x_rescales.py +++ b/scripts/compute_qm7x_rescales.py @@ -6,7 +6,7 @@ from mendeleev.fetch import fetch_table from tqdm import tqdm -sys.path.append(Path(__file__).resolve().parent.parent) +sys.path.append(str(Path(__file__).resolve().parent.parent)) from ocpmodels.common.utils import ( ROOT, From bcc2e0eddeaaa32a2f24c3d47041f91bc706fd3c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:23:11 -0500 Subject: [PATCH 174/273] finished first in exp manager --- ocpmodels/common/exp_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index a67a48f24f..16e19167bd 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -202,7 +202,9 @@ def parse_output_files(self): continue out_txt = out_file.read_text() - if "RaceCondition" in out_txt: + if "eval_all_splits" in out_txt and "Final results" in out_txt: + self.cache["job_state"][j] = "Finished" + elif "RaceCondition" in out_txt: self.cache["job_state"][j] = "RaceCondition" elif "Traceback" in out_txt: self.cache["job_state"][j] = ( @@ -211,8 +213,6 @@ def parse_output_files(self): elif "srun: Job step aborted" in out_txt: if "slurmstepd" in out_txt and " CANCELLED AT " in out_txt: self.cache["job_state"][j] = "Cancelled" - elif "eval_all_splits" in out_txt and "Final results" in out_txt: - self.cache["job_state"][j] = "Finished" elif "nan_loss" in out_txt: self.cache["job_state"][j] = "NaN loss" else: From 137815426d6c68346907c3d8b673b2ae007a08fc Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:25:03 -0500 Subject: [PATCH 175/273] 800 warmup epochs --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index 88e871f0a4..ed94f4fddc 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -21,7 +21,7 @@ default: # early stopping es_patience: 20 es_min_abs_change: 0.000001 - es_warmup_epochs: 500 + es_warmup_epochs: 800 # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min From 37ae180f4611203b368797b7af1df58f9250ff43 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:28:34 -0500 Subject: [PATCH 176/273] parse running/waiting jobs --- ocpmodels/common/exp_manager.py | 46 +++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 16e19167bd..e760c090b1 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -79,6 +79,23 @@ def __init__( self.job_ids = sorted( [p.name for runs in self.trial_hparams_to_rundirs.values() for p in runs] ) + sq_cmd = ( + "/opt/slurm/bin/squeue" + if "CC_CLUSTER" not in os.environ + else "/opt/software/slurm/bin/squeue" + ) + sq = set( + [ + j.strip() + for j in os.popen(f"{sq_cmd} -u $USER -o '%12i'") + .read() + .splitlines()[1:] + ] + ) + self.running_jobs = set(self.job_ids) & sq + self.waiting_jobs = ( + set([j.parent.name for j in 
RUN_DIR.glob(f"*/{self.name}.exp")]) & sq + ) - self.running_jobs print("\n") self.discover_yamls() self.discover_job_ids_from_yaml() @@ -127,33 +144,17 @@ def print_status(self): ) print("{:32} : {:4}".format("Existing wandb runs", len(self.wandb_runs))) print("{:32} : {}".format("Algorithm's budgets", str(self.budgets))) - sq_cmd = ( - "/opt/slurm/bin/squeue" - if "CC_CLUSTER" not in os.environ - else "/opt/software/slurm/bin/squeue" - ) - sq = set( - [ - j.strip() - for j in os.popen(f"{sq_cmd} -u $USER -o '%12i'") - .read() - .splitlines()[1:] - ] - ) - running = set(self.job_ids) & sq - waiting = ( - set([j.parent.name for j in RUN_DIR.glob(f"*/{self.name}.exp")]) & sq - ) - running + print( "{:32} : {}".format( "Jobs currently running:", - f"{len(running)} " + " ".join(sorted(running)), + f"{len(self.running_jobs)} " + " ".join(sorted(self.running_jobs)), ) ) print( "{:32} : {}".format( "Jobs currently waiting:", - f"{len(waiting)} " + " ".join(sorted(waiting)), + f"{len(self.waiting_jobs)} " + " ".join(sorted(self.waiting_jobs)), ) ) @@ -216,7 +217,12 @@ def parse_output_files(self): elif "nan_loss" in out_txt: self.cache["job_state"][j] = "NaN loss" else: - self.cache["job_state"][j] = "Unknown" + if j in self.waiting_jobs: + self.cache["job_state"][j] = "Waiting" + if j in self.running_jobs: + self.cache["job_state"][j] = "Running" + else: + self.cache["job_state"][j] = "Unknown" self.commit_cache() def print_output_files_stats(self): From 92dfa9a7b545c1802caf5558ada1cf4d1699d46f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:29:21 -0500 Subject: [PATCH 177/273] parse running/waiting jobs --- ocpmodels/common/exp_manager.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index e760c090b1..fae22c99c3 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -196,6 +196,12 @@ def parse_output_files(self): for j in tqdm(self.cache["all_job_ids"], desc="Parsing output files"): if j in self.cache["job_state"] and not self.rebuild_cache: continue + if j in self.waiting_jobs: + self.cache["job_state"][j] = "Waiting" + continue + if j in self.running_jobs: + self.cache["job_state"][j] = "Running" + continue out_file = RUN_DIR / j / "output-0.txt" if not out_file.exists(): @@ -217,12 +223,7 @@ def parse_output_files(self): elif "nan_loss" in out_txt: self.cache["job_state"][j] = "NaN loss" else: - if j in self.waiting_jobs: - self.cache["job_state"][j] = "Waiting" - if j in self.running_jobs: - self.cache["job_state"][j] = "Running" - else: - self.cache["job_state"][j] = "Unknown" + self.cache["job_state"][j] = "Unknown" self.commit_cache() def print_output_files_stats(self): From 7a496ea0e8a2689fe1003207caad1728d476b8cb Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:31:50 -0500 Subject: [PATCH 178/273] parse time limit --- ocpmodels/common/exp_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index fae22c99c3..7a46e9166d 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -211,6 +211,8 @@ def parse_output_files(self): out_txt = out_file.read_text() if "eval_all_splits" in out_txt and "Final results" in out_txt: self.cache["job_state"][j] = "Finished" + elif "DUE TO TIME LIMIT" in out_txt: + self.cache["job_state"][j] = "TimeLimit" elif "RaceCondition" in out_txt: self.cache["job_state"][j] = 
"RaceCondition" elif "Traceback" in out_txt: From 61114adcf236534287a5fec047b2ed78105891a8 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:34:06 -0500 Subject: [PATCH 179/273] improve ES --- configs/exps/icml/qm9/fanet-orion-qm9.yaml | 4 ++-- ocpmodels/modules/scheduler.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9.yaml b/configs/exps/icml/qm9/fanet-orion-qm9.yaml index ed94f4fddc..bfcf94caa6 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9.yaml @@ -25,10 +25,10 @@ default: # all below is for the scheduler scheduler: ReduceLROnPlateau mode: min - factor: 0.5 + factor: 0.75 threshold: 0.0001 threshold_mode: abs - min_lr: 0.00001 + min_lr: 0.000001 verbose: true note: model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index 7c0a5fc071..ca440b1854 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -178,15 +178,16 @@ def should_stop(self, metric, lr=None, epoch=None): else: self.counter += 1 - if self.warmup_epochs > 0 and epoch is not None and epoch < self.warmup_epochs: - self.counter = 0 - if self.counter >= self.patience: self.early_stop = "metric" if lr is not None and lr <= self.min_lr: self.early_stop = "lr" + if self.warmup_epochs > 0 and epoch is not None and epoch < self.warmup_epochs: + self.early_stop = "" + self.counter = 0 + return self.early_stop @property From d341351438e535494f303b076dd4391a073e07be Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:35:28 -0500 Subject: [PATCH 180/273] parse DatabaseTimeout --- ocpmodels/common/exp_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 7a46e9166d..5c072467b6 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -215,6 +215,8 @@ def parse_output_files(self): self.cache["job_state"][j] = "TimeLimit" elif "RaceCondition" in out_txt: self.cache["job_state"][j] = "RaceCondition" + elif "DatabaseTimeout: Could not acquire lock for PickledDB" in out_txt: + self.cache["job_state"][j] = "DatabaseTimeout" elif "Traceback" in out_txt: self.cache["job_state"][j] = ( "Traceback: " + out_txt.split("Traceback")[1] From 27b0eb0f0329a1c6b70bffc5cc2e3c931a9ad646 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 20:36:49 -0500 Subject: [PATCH 181/273] parse WaitingForTrials --- ocpmodels/common/exp_manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 5c072467b6..85916787ee 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -217,6 +217,11 @@ def parse_output_files(self): self.cache["job_state"][j] = "RaceCondition" elif "DatabaseTimeout: Could not acquire lock for PickledDB" in out_txt: self.cache["job_state"][j] = "DatabaseTimeout" + elif ( + "Algo does not have more trials to sample.Waiting for current trials to finish" # noqa: E501 + in out_txt + ): + self.cache["job_state"][j] = "WaitingForTrials" elif "Traceback" in out_txt: self.cache["job_state"][j] = ( "Traceback: " + out_txt.split("Traceback")[1] From 43da4ee65c266f185541212c32f1bf649927ff59 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: 
Wed, 18 Jan 2023 20:39:19 -0500 Subject: [PATCH 182/273] cleaner prints --- ocpmodels/common/exp_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 85916787ee..6fae71cf91 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -251,7 +251,7 @@ def print_output_files_stats(self): stats[o]["n"] += 1 stats[o]["ids"].append(j) for s, v in stats.items(): - print(f"• {s:31}" + f": {v['n']} (" + " ".join(v["ids"]) + ")") + print(f"\n• {s:31}" + f": {v['n']}\n " + " ".join(v["ids"])) if stats["Traceback"]["n"] > 0 and self.print_tracebacks: print("\nTraceback contents:\n" + "-" * 19 + "\n") print( From 2de50ce7f40e787c0fc0f47cd4b0500e6961cade Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 22:44:26 -0500 Subject: [PATCH 183/273] typo in normalizer state dict --- ocpmodels/modules/normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/modules/normalizer.py b/ocpmodels/modules/normalizer.py index bbe169eee9..df2830e276 100644 --- a/ocpmodels/modules/normalizer.py +++ b/ocpmodels/modules/normalizer.py @@ -54,7 +54,7 @@ def denorm(self, normed_tensor, hofs=None): def state_dict(self): sd = {"mean": self.mean, "std": self.std} - if self.hof_rescales: + if self.hof_mean is not None: sd["hof_rescales"] = { "mean": self.hof_mean, "std": self.hof_std, From 64da2d0bd8609178c5484b37d89fd9d8b23ab49c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 23:02:53 -0500 Subject: [PATCH 184/273] enable restart from dir --- ocpmodels/common/flags.py | 6 ++++ ocpmodels/common/utils.py | 49 ++++++++++++++++++++++-------- ocpmodels/trainers/base_trainer.py | 8 ++--- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index f7da16d626..2a8caa3e91 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -89,6 +89,12 @@ def add_core_args(self): self.parser.add_argument( "--continue_from_dir", type=str, help="Run to continue, loading its config" ) + self.parser.add_argument( + "--restart_from_dir", + type=str, + help="Run to restart, loading its config and overwriting " + + "from the command-line", + ) self.parser.add_argument( "--timestamp-id", default=None, diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index b7e337843f..f0de988127 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -282,7 +282,7 @@ def read_slurm_env(config): return config -def continue_from_slurm_job_id(config): +def continue_from_slurm_job_id(config, from_best=False): """ Assuming runs are consistently executed in a `run_dir` with the `run_dir/$SLURM_JOBID` pattern, this functions looks for an existing @@ -298,6 +298,8 @@ def continue_from_slurm_job_id(config): Args: config (dict): The original config to overwrite + from_best (bool, optional): If True, only looks for `best_checkpoint.pt`. + otherwise, looks for the latest checkpoint. Defaults to False. 
Returns: dict: The updated config if a checkpoint has been found @@ -314,9 +316,12 @@ def continue_from_slurm_job_id(config): if not ckpt_dir.exists() or not ckpt_dir.is_dir(): return config - best_ckp = ckpt_dir / "best_checkpoint.pt" - if best_ckp.exists(): - config["checkpoint"] = str(best_ckp) + if from_best: + best_ckp = ckpt_dir / "best_checkpoint.pt" + if best_ckp.exists(): + ckpt = str(best_ckp) + else: + raise FileNotFoundError(f"No best checkpoint found in {str(ckpt_dir)}") else: ckpts = list(ckpt_dir.glob("checkpoint-*.pt")) if not ckpts: @@ -325,7 +330,11 @@ def continue_from_slurm_job_id(config): ckpts, key=lambda f: float(f.stem.split("checkpoint-")[-1]) )[-1] if latest_ckpt.exists() and latest_ckpt.is_file(): - config["checkpoint"] = str(latest_ckpt) + ckpt = str(latest_ckpt) + + if ckpt: + config["checkpoint"] = ckpt + print(f"\n🎁 Resuming based on $SLURM_JOB_ID {JOB_ID} from {ckpt}\n") return config @@ -1108,8 +1117,16 @@ def build_config(args, args_override): if args_override != []: overrides = create_dict_from_args(args_override) - if args.continue_from_dir: - cont_dir = resolve(args.continue_from_dir) + if args.continue_from_dir or args.restart_from_dir: + if args.continue_from_dir and args.restart_from_dir: + raise ValueError( + "Cannot specify both --continue_from_dir and --restart_from_dir." + ) + cont_dir = ( + resolve(args.continue_from_dir) + if args.continue_from_dir + else resolve(args.restart_from_dir) + ) ckpts = list(cont_dir.glob("checkpoints/checkpoint-*.pt")) if not ckpts: print( @@ -1120,9 +1137,15 @@ def build_config(args, args_override): latest_ckpt = str( sorted(ckpts, key=lambda c: float(c.stem.split("-")[-1]))[-1] ) - continue_config["checkpoint"] = str(latest_ckpt) + if args.continue_from_dir: + continue_config["checkpoint"] = str(latest_ckpt) continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] - print("✅ Loading config from cont dir and latest checkpoint:", latest_ckpt) + print( + f"✅ Loading config from directory {str(cont_dir)}" + + f" and latest checkpoint: {latest_ckpt}" + if args.continue_from_dir + else "" + ) args.config = continue_config["config"] config = load_config(args.config) @@ -1137,13 +1160,13 @@ def build_config(args, args_override): config["world_size"] = args.num_nodes * args.num_gpus if continue_config: - dirs_k_v = [(k, v) for k, v in config.items() if "dir" in k] - dataset_config = copy.deepcopy(config["dataset"]) + new_dirs = [(k, v) for k, v in config.items() if "dir" in k] + # dataset_config = copy.deepcopy(config["dataset"]) config = merge_dicts( continue_config, - {k: resolve(v) if isinstance(v, str) else v for k, v in dirs_k_v}, + {k: resolve(v) if isinstance(v, str) else v for k, v in new_dirs}, ) - config["dataset"] = dataset_config + # config["dataset"] = dataset_config config = merge_dicts(config, cli_args_dict()) config = merge_dicts(config, overrides) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 507d8eefc9..c22ef01139 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -320,10 +320,10 @@ def load_datasets(self): std=self.normalizer["target_std"], device=self.device, ) - if "hof_rescales" in self.normalizer: - self.normalizers["target"].set_hof_rescales( - self.normalizer["hof_rescales"] - ) + if "hof_rescales" in self.normalizer: + self.normalizers["target"].set_hof_rescales( + self.normalizer["hof_rescales"] + ) else: self.normalizers["target"] = Normalizer( tensor=self.datasets["train"].data.y[ From 
ef0715db083ab754147830a98035a9996a48b5a3 Mon Sep 17 00:00:00 2001
From: Victor Schmidt
Date: Wed, 18 Jan 2023 23:14:00 -0500
Subject: [PATCH 185/273] raise value error in case of missing config arg

---
 ocpmodels/common/utils.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py
index f0de988127..2c201644f0 100644
--- a/ocpmodels/common/utils.py
+++ b/ocpmodels/common/utils.py
@@ -1142,12 +1142,19 @@ def build_config(args, args_override):
         continue_config = torch.load((latest_ckpt), map_location="cpu")["config"]
         print(
             f"✅ Loading config from directory {str(cont_dir)}"
-            + f" and latest checkpoint: {latest_ckpt}"
-            if args.continue_from_dir
-            else ""
+            + (
+                f" and latest checkpoint: {latest_ckpt}"
+                if args.continue_from_dir
+                else " (restarting from scratch)"
+            )
         )
         args.config = continue_config["config"]
 
+    if args.config is None:
+        raise ValueError(
+            "Must specify a config file with " + f"--config. Received args: {args}"
+        )
+
     config = load_config(args.config)
     config = merge_dicts(config, args_dict_with_defaults)
     config = merge_dicts(config, overrides)

From d4d00a6569e6ea8923bc5be8e01e62a1e4aafc94 Mon Sep 17 00:00:00 2001
From: Victor Schmidt
Date: Wed, 18 Jan 2023 23:34:27 -0500
Subject: [PATCH 186/273] qm9 fanet v4

---
 configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml

diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml
new file mode 100644
index 0000000000..92774ccd0f
--- /dev/null
+++ b/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml
@@ -0,0 +1,68 @@
+# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij
+job:
+  mem: 8GB
+  cpus: 4
+  gres: gpu:1
+  time: 02:50:00
+  partition: long
+
+default:
+  wandb_project: ocp-qm
+  config: fanet-qm9-all
+  mode: train
+  test_ri: true
+  wandb_tags: qm9, orion
+  log_train_every: 100
+  optim:
+    batch_size: 32
+    warmup_steps: 3000
+    # parameters EMA
+    ema_decay: 0.999
+    loss_energy: mse
+    # early stopping
+    es_patience: 20
+    es_min_abs_change: 0.000001
+    es_warmup_epochs: 800
+    # all below is for the scheduler
+    scheduler: ReduceLROnPlateau
+    mode: min
+    factor: 0.75
+    threshold: 0.0001
+    threshold_mode: abs
+    min_lr: 0.000001
+    verbose: true
+  note:
+    model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm
+    optim: batch_size, lr_initial
+    _root_: frame_averaging, fa_frames
+  orion_mult_factor:
+    value: 32
+    targets: num_filters, pg_hidden_channels, phys_hidden_channels, num_gaussians
+  frame_averaging: 3D
+  fa_frames: random
+  model:
+    edge_embed_type: all_rij
+    energy_head: ""
+
+orion:
+  # Remember to change the experiment name if you change anything in the search space
+  n_jobs: 20
+
+  unique_exp_name: fanet-qm9-v4.0.0
+
+  space:
+    optim/max_epochs: fidelity(200, 2000, base=5)
+    optim/lr_initial: loguniform(1e-4, 6e-4, precision=3)
+    model/graph_norm: choices([True, False])
+    model/hidden_channels: uniform(5, 15, discrete=True)
+    model/mp_type: choices(["simple", "base", "updownscale", "updownscale_base"])
+    model/num_filters: uniform(4, 16, discrete=True)
+    model/num_gaussians: uniform(1, 4, discrete=True)
+    model/num_interactions: uniform(3, 5, discrete=True)
+    model/pg_hidden_channels: uniform(0, 1, discrete=True)
+    model/phys_embeds: 
choices([True, False]) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From b752916a2d30a387a3982a8d29a27b91b7e8b4ce Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 23:46:18 -0500 Subject: [PATCH 187/273] min hidden_channels --- configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml index 92774ccd0f..b43d252a8e 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v4.yaml @@ -48,13 +48,13 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 20 - unique_exp_name: fanet-qm9-v4.0.0 + unique_exp_name: fanet-qm9-v4.0.1 space: optim/max_epochs: fidelity(200, 2000, base=5) optim/lr_initial: loguniform(1e-4, 6e-4, precision=3) model/graph_norm: choices([True, False]) - model/hidden_channels: uniform(5, 15, discrete=True) + model/hidden_channels: uniform(6, 15, discrete=True) model/mp_type: choices(["simple", "base", "updownscale", "updownscale_base"]) model/num_filters: uniform(4, 16, discrete=True) model/num_gaussians: uniform(1, 4, discrete=True) From 585b5bee94f49ecea04020b245899135c23bc5aa Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 18 Jan 2023 23:52:42 -0500 Subject: [PATCH 188/273] report 1e12 for error in trainer init --- main.py | 40 +++++++++++++++++++++++---------- ocpmodels/common/exp_manager.py | 5 +++++ 2 files changed, 33 insertions(+), 12 deletions(-) diff --git a/main.py b/main.py index dc763ce3de..ea6aa210ac 100644 --- a/main.py +++ b/main.py @@ -8,10 +8,8 @@ import copy import logging import os -import shutil import time import traceback -import warnings import torch from orion.core.utils.exceptions import ReservationRaceCondition @@ -65,7 +63,7 @@ def __init__(self, trainer_config): self.hparams = {} def run(self, orion_exp=None): - orion_trial = None + orion_trial = signal = None self.original_config = copy.deepcopy(self.trainer_config) orion_race_condition = False if dist_utils.is_master(): @@ -110,31 +108,49 @@ def run(self, orion_exp=None): self.trainer_config = continue_orion_exp(self.trainer_config) self.trainer_config = auto_note(self.trainer_config) cls = registry.get_trainer_class(self.trainer_config["trainer"]) - self.trainer: BaseTrainer = cls(**self.trainer_config) - task = registry.get_task_class(self.trainer_config["mode"])(self.trainer_config) - task.setup(self.trainer) - start_time = time.time() - print_warnings() + try: + self.trainer: BaseTrainer = cls(**self.trainer_config) + except Exception as e: + print(f"Error in trainer initialization: {e}") + traceback.print_exc() + signal = "trainer_init_error" + + if signal is None: + task = registry.get_task_class(self.trainer_config["mode"])( + self.trainer_config + ) + task.setup(self.trainer) + start_time = time.time() + print_warnings() - signal = task.run() + signal = task.run() # handle job preemption / time limit if signal == "SIGTERM": print("\nJob was preempted. 
Wrapping up...\n") - self.trainer.close_datasets() + if self.trainer: + self.trainer.close_datasets() dist_utils.synchronize() logging.info(f"Total time taken: {time.time() - start_time}") - if self.trainer.logger is not None: + if self.trainer and self.trainer.logger is not None: self.trainer.logger.log({"Total time": time.time() - start_time}) - objective = dist_utils.broadcast_from_master(self.trainer.objective) + objective = dist_utils.broadcast_from_master( + self.trainer.objective if self.trainer else None + ) if orion_exp is not None: if objective is None: if signal == "loss_is_nan": objective = 1e12 print("Received NaN objective from worker. Setting to 1e12.") + if signal == "trainer_init_error": + objective = 1e12 + print( + "Received trainer_init_error from worker.", + "Setting objective to 1e12.", + ) else: print("Received None objective from worker. Skipping observation.") if objective is not None: diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 6fae71cf91..6a2bead459 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -222,6 +222,11 @@ def parse_output_files(self): in out_txt ): self.cache["job_state"][j] = "WaitingForTrials" + elif ( + "RuntimeError: Trying to create tensor with negative dimension" + in out_txt + ): + self.cache["job_state"][j] = "NegativeEmbeddingDimension" elif "Traceback" in out_txt: self.cache["job_state"][j] = ( "Traceback: " + out_txt.split("Traceback")[1] From 04487ec37b5eb86f0eca5151dfa041c88325e17a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 00:00:39 -0500 Subject: [PATCH 189/273] fix unbounded var start_time --- main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index ea6aa210ac..ac749e1d2e 100644 --- a/main.py +++ b/main.py @@ -115,12 +115,12 @@ def run(self, orion_exp=None): traceback.print_exc() signal = "trainer_init_error" + start_time = time.time() if signal is None: task = registry.get_task_class(self.trainer_config["mode"])( self.trainer_config ) task.setup(self.trainer) - start_time = time.time() print_warnings() signal = task.run() @@ -132,9 +132,10 @@ def run(self, orion_exp=None): self.trainer.close_datasets() dist_utils.synchronize() - logging.info(f"Total time taken: {time.time() - start_time}") + total_time = time.time() - start_time + logging.info(f"Total time taken: {total_time}") if self.trainer and self.trainer.logger is not None: - self.trainer.logger.log({"Total time": time.time() - start_time}) + self.trainer.logger.log({"Total time": total_time}) objective = dist_utils.broadcast_from_master( self.trainer.objective if self.trainer else None From 6607071d9bd2c1e7a9d4f7caa765621527e66ebe Mon Sep 17 00:00:00 2001 From: AlexDuvalinho Date: Thu, 19 Jan 2023 03:44:13 -0500 Subject: [PATCH 190/273] orion and top config --- .../exps/icml/is2re-all/fanet-orion-2.yaml | 2 +- .../exps/icml/is2re-all/fanet-orion-3.yaml | 8 +- .../exps/icml/is2re-all/fanet-orion-4.yaml | 60 ++++ configs/exps/icml/is2re-all/top-config-2.yaml | 263 ++++++++++++++++ configs/exps/icml/is2re-all/top-config.yaml | 286 ++++++++++++++++-- scripts/gnn_dev.py | 3 +- 6 files changed, 589 insertions(+), 33 deletions(-) create mode 100644 configs/exps/icml/is2re-all/fanet-orion-4.yaml create mode 100644 configs/exps/icml/is2re-all/top-config-2.yaml diff --git a/configs/exps/icml/is2re-all/fanet-orion-2.yaml b/configs/exps/icml/is2re-all/fanet-orion-2.yaml index cf88591af6..a41f774433 100644 --- 
a/configs/exps/icml/is2re-all/fanet-orion-2.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-2.yaml @@ -32,7 +32,7 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 72 + n_jobs: 12 unique_exp_name: fanet-is2re-all-v2 diff --git a/configs/exps/icml/is2re-all/fanet-orion-3.yaml b/configs/exps/icml/is2re-all/fanet-orion-3.yaml index 8daecd138c..f57167d004 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-3.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-3.yaml @@ -3,7 +3,7 @@ job: mem: 32GB cpus: 4 gres: gpu:rtx8000:1 - time: 14:00:00 + time: 12:00:00 partition: long default: @@ -30,9 +30,9 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 72 + n_jobs: 216 - unique_exp_name: fanet-is2re-all-v2 + unique_exp_name: fanet-is2re-all-v3 space: model/complex_mp: choices([True, False]) @@ -49,7 +49,7 @@ orion: model/pg_hidden_channels: uniform(1, 2, discrete=True) model/phys_embeds: choices([True, False]) model/second_layer_MLP: choices([True, False]) - model/skip_co: choices(["concat", False]) + model/skip_co: choices(["concat", False, "concat-atom"]) model/tag_hidden_channels: uniform(1, 2, discrete=True) optim/lr_initial: loguniform(9e-4, 5e-3, precision=2) optim/max_epochs: fidelity(7, 15, base=6) diff --git a/configs/exps/icml/is2re-all/fanet-orion-4.yaml b/configs/exps/icml/is2re-all/fanet-orion-4.yaml new file mode 100644 index 0000000000..f86ea559f4 --- /dev/null +++ b/configs/exps/icml/is2re-all/fanet-orion-4.yaml @@ -0,0 +1,60 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + time: 12:00:00 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-is2re-all + mode: train + test_ri: True + wandb_tags: is2re-all, orion-3 + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + graph_norm: True + frame_averaging: 2D + optim: + scheduler: LinearWarmupCosineAnnealingLR + eval_every: 0.5 + lr_initial: 0.002 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 150 + + unique_exp_name: fanet-is2re-all-v4 + + space: + model/complex_mp: choices([True, False]) + model/cutoff: choices([4.0, 6.0]) + model/edge_embed_type: choices(["all_rij"]) + model/energy_head: choices(["weighted-av-final-embeds"]) + model/fa_frames: choices(["random", "se3-random"]) + model/hidden_channels: uniform(9, 17, discrete=True) + model/max_num_neighbors: choices([30, 40, 50]) + model/mp_type: choices(["base", "updownscale", "updownscale_base", "updown_local_env"]) + model/num_filters: uniform(4, 15, discrete=True) + model/num_gaussians: uniform(40, 140, discrete=True) + model/num_interactions: uniform(4, 8, discrete=True) + model/pg_hidden_channels: uniform(1, 3, discrete=True) + model/phys_embeds: choices([True, False]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["concat", False, "concat-atom"]) + model/tag_hidden_channels: uniform(1, 2, 
discrete=True) + optim/max_epochs: fidelity(6, 12, base=6) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 1 diff --git a/configs/exps/icml/is2re-all/top-config-2.yaml b/configs/exps/icml/is2re-all/top-config-2.yaml new file mode 100644 index 0000000000..00bf54b35b --- /dev/null +++ b/configs/exps/icml/is2re-all/top-config-2.yaml @@ -0,0 +1,263 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 15:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + wandb_tags: 'best-config' + optim: + batch_size: 256 + eval_batch_size: 256 + max_epochs: 9 + cp_data_to_tmpdir: true + +runs: + + - config: fanet-is2re-all # 2700544 + note: 'top-02-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 300 + num_filters: 300 + num_gaussians: 70 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0022 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 8 + eval_every: 0.4 + + - config: fanet-is2re-all + note: 'top-1-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: updownscale + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: true + hidden_channels: 352 + num_filters: 448 + num_gaussians: 99 + num_interactions: 8 + second_layer_MLP: True + skip_co: concat + optim: + lr_initial: 0.0019 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-1-modif' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 375 + num_filters: 448 + num_gaussians: 110 + num_interactions: 6 + skip_co: concat + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-2-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: updown_local_env + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 192 + num_filters: 480 + num_gaussians: 98 + num_interactions: 5 + second_layer_MLP: True + skip_co: add + optim: + lr_initial: 0.0027 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-3' + frame_averaging: 2D + fa_frames: random + model: + mp_type: updown_local_env + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 288 + num_filters: 480 + num_gaussians: 45 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 4.0 + optim: + lr_initial: 0.003 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-3-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: updown_local_env + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 288 + num_filters: 480 + num_gaussians: 90 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 
0.003 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-4' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: False + tag_hidden_channels: 0 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 384 + num_gaussians: 77 + num_interactions: 4 + second_layer_MLP: False + skip_co: False + cutoff: 10.0 + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-4-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 384 + num_gaussians: 90 + num_interactions: 4 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-5' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: False + complex_mp: True + graph_norm: True + hidden_channels: 320 + num_filters: 416 + num_gaussians: 36 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.0034 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-5-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 350 + num_filters: 416 + num_gaussians: 80 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0034 + scheduler: LinearWarmupCosineAnnealingLR + - config: fanet-is2re-all + note: 'top-6' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 480 + num_filters: 352 + num_gaussians: 72 + num_interactions: 5 + second_layer_MLP: False + skip_co: False + cutoff: 6.0 + optim: + lr_initial: 0.0034 + scheduler: LinearWarmupCosineAnnealingLR \ No newline at end of file diff --git a/configs/exps/icml/is2re-all/top-config.yaml b/configs/exps/icml/is2re-all/top-config.yaml index 0debe73c19..845ef052ff 100644 --- a/configs/exps/icml/is2re-all/top-config.yaml +++ b/configs/exps/icml/is2re-all/top-config.yaml @@ -39,11 +39,11 @@ runs: optim: lr_initial: 0.0019 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 9 + max_epochs: 20 + eval_every: 0.4 - config: fanet-is2re-all note: 'top-1-FA' - frame_averaging: 2D - fa_frames: all + frame_averaging: DA model: mp_type: updownscale phys_embeds: False @@ -61,47 +61,279 @@ runs: optim: lr_initial: 0.0019 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 9 - - config: fanet-is2re-all - note: 'top-1-FA' - frame_averaging: DA + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700539 + note: 'top-3-modif' + frame_averaging: 2D + fa_frames: random model: - mp_type: updownscale + mp_type: updown_local_env phys_embeds: False tag_hidden_channels: 32 pg_hidden_channels: 64 energy_head: weighted-av-final-embeds - complex_mp: False + complex_mp: True graph_norm: True - hidden_channels: 352 - num_filters: 
448 - num_gaussians: 99 - num_interactions: 6 + hidden_channels: 300 + num_filters: 480 + num_gaussians: 90 + num_interactions: 5 second_layer_MLP: True skip_co: concat - optim: - lr_initial: 0.0019 + cutoff: 6.0 + optim: + lr_initial: 0.003 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 9 - - config: fanet-is2re-all - note: 'top-1-FA' - frame_averaging: 3D + eval_every: 0.4 + max_epochs: 18 + + - config: fanet-is2re-all # 2700540 + note: 'top-4-modif' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: False + tag_hidden_channels: 0 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 384 + num_gaussians: 77 + num_interactions: 4 + second_layer_MLP: True + skip_co: concat-atom + cutoff: 8.0 + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700541 + note: 'top-4-DA' + frame_averaging: DA + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 384 + num_filters: 384 + num_gaussians: 90 + num_interactions: 4 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0025 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700542 + note: 'top-5' + frame_averaging: 2D fa_frames: random model: - mp_type: updownscale + mp_type: base + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: False + complex_mp: True + graph_norm: True + hidden_channels: 320 + num_filters: 416 + num_gaussians: 36 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.0034 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700543 + note: 'top-5-modif' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 64 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 350 + num_filters: 416 + num_gaussians: 80 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat-atom + cutoff: 6.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-6' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 480 + num_filters: 352 + num_gaussians: 72 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-01' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 512 + num_filters: 200 + num_gaussians: 150 + num_interactions: 4 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0023 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 
+ + - config: fanet-is2re-all # 2700544 + note: 'top-01-modif' + frame_averaging: 2D + fa_frames: DA + model: + mp_type: base + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 64 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 512 + num_filters: 250 + num_gaussians: 130 + num_interactions: 4 + second_layer_MLP: True + skip_co: False + cutoff: 6.0 + optim: + lr_initial: 0.0023 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-01-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base phys_embeds: False tag_hidden_channels: 32 pg_hidden_channels: 64 energy_head: weighted-av-final-embeds complex_mp: False graph_norm: True - hidden_channels: 352 - num_filters: 448 - num_gaussians: 99 + hidden_channels: 512 + num_filters: 300 + num_gaussians: 130 + num_interactions: 4 + second_layer_MLP: False + skip_co: concat + cutoff: 6.0 + optim: + lr_initial: 0.0023 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-02-modif' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: base + phys_embeds: False + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: False + graph_norm: True + hidden_channels: 275 + num_filters: 288 + num_gaussians: 60 num_interactions: 6 - second_layer_MLP: True + second_layer_MLP: False + skip_co: add + cutoff: 6.0 + optim: + lr_initial: 0.0022 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 20 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-02-modif' + frame_averaging: 2D + fa_frames: random + model: + mp_type: base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 32 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 300 + num_filters: 300 + num_gaussians: 70 + num_interactions: 6 + second_layer_MLP: False skip_co: concat - optim: - lr_initial: 0.0019 + cutoff: 6.0 + optim: + lr_initial: 0.0022 scheduler: LinearWarmupCosineAnnealingLR - max_epochs: 9 + max_epochs: 20 + eval_every: 0.4 \ No newline at end of file diff --git a/scripts/gnn_dev.py b/scripts/gnn_dev.py index 617e54cc18..28c8f55e1f 100644 --- a/scripts/gnn_dev.py +++ b/scripts/gnn_dev.py @@ -25,10 +25,11 @@ config["model"] = {"use_pbc": True} config["model"]["edge_embed_type"] = "all_rij" config["model"]["mp_type"] = "base" - config["model"]["skip_co"] = False + config["model"]["skip_co"] = "concat-atom" # add, concat, config["model"]["att_heads"] = 3 config["model"]["complex_mp"] = True config["model"]["graph_norm"] = True + config["optim"]["eval_every"] = 0.5 # config["model"]["regress_forces"] = "direct_with_gradient_target" checkpoint_path = None From f332cdb015d0fd83bdfc99434abcbe9f1e45b82c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 15:45:03 -0500 Subject: [PATCH 191/273] implement qm7x lse data shift --- configs/exps/qm7x/schnet-fanet-lse.yaml | 113 +++++++++++++++++++ configs/exps/qm7x/schnet-fanet-noenergy.yaml | 108 ++++++++++++++++++ configs/exps/qm7x/schnet-fanet.yaml | 5 + configs/models/qm7x-metadata/lse-shifts.json | 1 + configs/models/tasks/qm7x.yaml | 5 +- ocpmodels/common/utils.py | 12 ++ ocpmodels/datasets/qm7x.py | 23 +++- ocpmodels/trainers/base_trainer.py | 8 -- scripts/compute_qm7x_lse.py | 49 ++++++++ 9 files changed, 312 insertions(+), 12 deletions(-) create mode 100644 
configs/exps/qm7x/schnet-fanet-lse.yaml create mode 100644 configs/exps/qm7x/schnet-fanet-noenergy.yaml create mode 100644 configs/models/qm7x-metadata/lse-shifts.json create mode 100644 scripts/compute_qm7x_lse.py diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml new file mode 100644 index 0000000000..97097a6c16 --- /dev/null +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -0,0 +1,113 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: schnet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions + optim: batch_size, lr_initial + optim: + batch_size: 10 + max_epochs: 100 + warmup_steps: 3000 + lr_initial: 0.0001 + eval_every: 0.34 + # parameters EMA + ema_decay: 0.999 + energy_coefficient: 0. + energy_grad_coefficient: 0 + force_coefficient: 1. + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + hidden_channels: 128 + num_filters: 128 + num_gaussians: 20 + num_interactions: 6 + cutoff: 5.0 + regress_forces: from_energy + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + val_ood: + lse_shift: True + +runs: + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + optim: + lr_initial: 0.001 + batch_size: 100 + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + optim: + lr_initial: 0.001 + batch_size: 256 + + - config: fanet-qm7x-all + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + + - config: fanet-qm7x-all + optim: + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct_with_gradient_target + + - config: fanet-qm7x-all + optim: + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct + mp_type: updownscale_base + num_interactions: 4 diff --git a/configs/exps/qm7x/schnet-fanet-noenergy.yaml b/configs/exps/qm7x/schnet-fanet-noenergy.yaml new file mode 100644 index 0000000000..bbe89b7f75 --- /dev/null +++ b/configs/exps/qm7x/schnet-fanet-noenergy.yaml @@ -0,0 +1,108 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + env: ocp-a100 + +default: + config: schnet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions + optim: batch_size, lr_initial + optim: + batch_size: 10 + max_epochs: 100 + warmup_steps: 3000 + lr_initial: 0.0001 + eval_every: 0.34 + # parameters EMA + 
ema_decay: 0.999 + energy_coefficient: 0. + energy_grad_coefficient: 0 + force_coefficient: 1. + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.5 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + hidden_channels: 128 + num_filters: 128 + num_gaussians: 20 + num_interactions: 6 + cutoff: 5.0 + regress_forces: from_energy + dataset: + train: + rescale_with_hof: False + +runs: + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + optim: + lr_initial: 0.001 + batch_size: 100 + + - config: schnet-qm7x-all + dataset: + train: + normalize_labels: True + optim: + lr_initial: 0.001 + batch_size: 256 + + - config: fanet-qm7x-all + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + + - config: fanet-qm7x-all + optim: + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct_with_gradient_target + + - config: fanet-qm7x-all + optim: + lr_initial: 0.001 + batch_size: 100 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct + mp_type: updownscale_base + num_interactions: 4 diff --git a/configs/exps/qm7x/schnet-fanet.yaml b/configs/exps/qm7x/schnet-fanet.yaml index 10c22056b1..fd91abe050 100644 --- a/configs/exps/qm7x/schnet-fanet.yaml +++ b/configs/exps/qm7x/schnet-fanet.yaml @@ -55,6 +55,11 @@ default: dataset: train: rescale_with_hof: True + lse_shift: False + val_id: + lse_shift: False + val_ood: + lse_shift: False runs: - config: schnet-qm7x-all diff --git a/configs/models/qm7x-metadata/lse-shifts.json b/configs/models/qm7x-metadata/lse-shifts.json new file mode 100644 index 0000000000..8893e2a5cc --- /dev/null +++ b/configs/models/qm7x-metadata/lse-shifts.json @@ -0,0 +1 @@ +[0.0, -16.48365429710017, 0.0, 0.0, 0.0, 0.0, -1035.230325647512, -1488.1741712581756, -2045.3532693858685, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -10832.70108036143, -12520.741665730922] \ No newline at end of file diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index c832e2c82f..3408bf5b8f 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -36,7 +36,8 @@ default: target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 - rescale_with_hof: True + rescale_with_hof: False + lse_shift: True val_id: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by utils.py if this is True @@ -44,6 +45,7 @@ default: target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 + lse_shift: True val_ood: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by utils.py if this is True @@ -51,6 +53,7 @@ default: target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 + lse_shift: True # TEST SET DO NOT ENABLE # - src: /network/projects/ocp/qm9 diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 2c201644f0..2d79d56f06 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -307,6 +307,9 @@ def continue_from_slurm_job_id(config, from_best=False): if config.get("checkpoint"): return config + if config.get("no-resume"): + return config + job_id = os.environ.get("SLURM_JOBID") if job_id is None: return config @@ -536,6 +539,15 @@ def 
set_qm7x_target_stats(trainer_config): continue if not dataset.get("normalize_labels", False): continue + else: + if dataset.get("lse_shift"): + print( + "Setting normalize_labels to False because of lse_shift for split", + f"{d}.", + ) + trainer_config["dataset"][d]["normalize_labels"] = False + continue + assert "target" in dataset, "target must be specified." mean = target_stats[dataset["target"]]["mean"] std = target_stats[dataset["target"]]["std"] diff --git a/ocpmodels/datasets/qm7x.py b/ocpmodels/datasets/qm7x.py index f98774a60c..97c3b85225 100644 --- a/ocpmodels/datasets/qm7x.py +++ b/ocpmodels/datasets/qm7x.py @@ -710,6 +710,7 @@ def __init__( }, transform=None, ): + self.config = config lmdb_path = Path(config["src"]).expanduser().resolve() self.lmdb_path = str(lmdb_path) if not lmdb_path.exists(): @@ -762,6 +763,20 @@ def __init__( self.hofs[np.isnan(self.hofs)] = self.hofs[~np.isnan(self.hofs)].mean() self.hofs = torch.from_numpy(self.hofs).float() + self.lse_shifts = None + if self.config.get("lse_shift"): + self.lse_shifts = torch.tensor( + json.loads( + ( + ROOT + / "configs" + / "models" + / "qm7x-metadata" + / "lse-shifts.json" + ).read_text() + ) + ) + self.transform = transform def __len__(self): @@ -793,9 +808,11 @@ def __getitem__(self, i): data.natoms = len(data.pos) data.tags = torch.full((data.natoms,), -1, dtype=torch.long) data.atomic_numbers = torch.tensor(data.atNUM, dtype=torch.long) - data.hofs = self.hofs[ - data.atomic_numbers.numpy().astype(int) - 1 # element 1 is at row 0 - ].sum() + data.hofs = self.hofs[data.atomic_numbers - 1].sum() # element 1 is at row 0 + if self.lse_shifts is not None: + data.lse_shift = self.lse_shifts[data.atomic_numbers].sum() + data.y_unshifted = data.y + data.y = data.y - data.lse_shift t1 = time.time_ns() if self.transform is not None: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index c22ef01139..bbc647ee65 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -13,7 +13,6 @@ from abc import ABC, abstractmethod from collections import defaultdict from copy import deepcopy -from pathlib import Path import numpy as np import torch @@ -341,13 +340,6 @@ def load_task(self): pass def load_model(self): - # Build model - if not self.silent: - print( - f"🧠 Loading model {self.config['model_name']}:\n" - + f" {yaml.dump(self.config['model'])}" - ) - bond_feat_dim = None bond_feat_dim = self.config["model"].get("num_gaussians", 50) diff --git a/scripts/compute_qm7x_lse.py b/scripts/compute_qm7x_lse.py new file mode 100644 index 0000000000..e831f22723 --- /dev/null +++ b/scripts/compute_qm7x_lse.py @@ -0,0 +1,49 @@ +import json +from pathlib import Path +import h5py +from tqdm import tqdm +import numpy as np +from sklearn.feature_extraction import DictVectorizer + + +def count_fn(y): + return dict(zip(*np.unique(y, return_counts=True))) + + +if __name__ == "__main__": + # from SO3Krates + # https://github.com/thorben-frank/mlff/blob/v0.1/mlff/src/data/preprocessing.py#L297 + base = Path("/network/projects/ocp/qm7x/source") + h5_paths = sorted(base.glob("*.hdf5")) + h5s = [h5py.File(p, "r") for p in h5_paths] + data = [ + (h5[f"{mol}/{conf}/ePBE0+MBD"][0], h5[f"{mol}/{conf}/atNUM"][:]) + for i, h5 in enumerate(h5s) + for mol in tqdm(h5, desc=f"Reading file {h5_paths[i].name}", leave=False) + for conf in tqdm(h5[mol], desc=f"Molecule {mol}", leave=False) + ] + + q = np.array([d[0] for d in data]) + max_n_atoms = max([len(d[1]) for d in data]) + z = 
np.array([np.pad(d[1], (0, max_n_atoms - len(d[1]))) for d in data]) + u = np.unique(z) + idx_ = u != 0 # remove padding with 0 + lhs_counts = list(map(count_fn, z)) + v = DictVectorizer(sparse=False) + X = v.fit_transform(lhs_counts) + X = X[..., idx_] + + sol = np.linalg.lstsq(X, q, rcond=None) + shifts = np.zeros(np.max(u) + 1) + for k, v in dict(zip(u[idx_], sol[0])).items(): + shifts[k] = v + + ( + Path(__file__).resolve().parent.parent + / "configs" + / "models" + / "qm7x-metadata" + / "lse-shifts.json" + ).write_text(json.dumps(shifts.tolist())) + + q_shifts = shifts[z].sum(-1) From f6573a9d15f6f864794446d285c3ddf76fe230e7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 17:26:36 -0500 Subject: [PATCH 192/273] refactor back without runner and split utils --- main.py | 230 ++++++++++++--------------- ocpmodels/common/logger.py | 1 + ocpmodels/common/orion_utils.py | 227 ++++++++++++++++++++++++++ ocpmodels/common/utils.py | 211 +++--------------------- ocpmodels/trainers/base_trainer.py | 2 +- ocpmodels/trainers/single_trainer.py | 4 +- 6 files changed, 355 insertions(+), 320 deletions(-) create mode 100644 ocpmodels/common/orion_utils.py diff --git a/main.py b/main.py index ac749e1d2e..56993e2624 100644 --- a/main.py +++ b/main.py @@ -10,9 +10,8 @@ import os import time import traceback - +import sys import torch -from orion.core.utils.exceptions import ReservationRaceCondition from yaml import dump from ocpmodels.common import dist_utils @@ -20,21 +19,20 @@ from ocpmodels.common.registry import registry from ocpmodels.common.utils import ( JOB_ID, - apply_mult_factor, auto_note, build_config, - continue_from_slurm_job_id, - continue_orion_exp, - load_orion_exp, merge_dicts, move_lmdb_data_to_slurm_tmpdir, - read_slurm_env, resolve, - set_max_fidelity, setup_imports, setup_logging, - unflatten_dict, update_from_sbatch_py_vars, + set_hidden_channels, +) +from ocpmodels.common.orion_utils import ( + continue_orion_exp, + load_orion_exp, + sample_orion_hparams, ) from ocpmodels.trainers import BaseTrainer @@ -56,89 +54,119 @@ def print_warnings(): print("-" * 80 + "\n") -class Runner: - def __init__(self, trainer_config): - self.trainer_config = trainer_config - self.trainer = None - self.hparams = {} +def wrap_up(args, start_time, trainer=None, error=None, signal=None): + + total_time = time.time() - start_time + logging.info(f"Total time taken: {total_time}") + if trainer and trainer.logger is not None: + trainer.logger.log({"Total time": total_time}) + + if args.distributed: + print( + "\nWaiting for all processes to finish with dist_utils.cleanup()...", + end="", + ) + dist_utils.cleanup() + print("Done!") + + if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): + print("\nSelf-canceling SLURM job in 32s", JOB_ID) + os.popen(f"sleep 32 && scancel {JOB_ID}") + + if trainer and trainer.logger: + trainer.logger.finish(error or signal) + + +if __name__ == "__main__": + error = signal = orion_exp = orion_trial = None + orion_race_condition = False + hparams = {} + + setup_logging() + + parser = flags.get_parser() + args, override_args = parser.parse_known_args() + args = update_from_sbatch_py_vars(args) + if args.logdir: + args.logdir = resolve(args.logdir) + + trainer_config = build_config(args, override_args) + original_trainer_config = copy.deepcopy(trainer_config) + + if args.distributed: + dist_utils.setup(trainer_config) + print("Distributed backend setup.") + + if dist_utils.is_master(): + trainer_config = 
move_lmdb_data_to_slurm_tmpdir(trainer_config) + dist_utils.synchronize() + + trainer_config["dataset"] = dist_utils.broadcast_from_master( + trainer_config["dataset"] + ) + + # -- Initial setup + + setup_imports() + print("All things imported.\n") + start_time = time.time() + + try: + + # -- Orion + + if args.orion_exp_config_path and dist_utils.is_master(): + orion_exp = load_orion_exp(args) - def run(self, orion_exp=None): - orion_trial = signal = None - self.original_config = copy.deepcopy(self.trainer_config) - orion_race_condition = False if dist_utils.is_master(): if orion_exp: - try: - orion_trial = orion_exp.suggest(1) - print( - "\n🚨 Orion reservation race condition detected. Exiting", - "and deleting run dir", - ) - self.hparams = set_max_fidelity( - unflatten_dict( - apply_mult_factor( - orion_trial.params, - self.trainer_config.get("orion_mult_factor"), - sep="/", - ), - sep="/", - ), - orion_exp, - ) - self.hparams["orion_hash_params"] = orion_trial.hash_params - self.hparams["orion_unique_exp_name"] = orion_exp.name - except ReservationRaceCondition: - orion_race_condition = True - import wandb - - if wandb.run is not None: - if wandb.run.tags: - wandb.run.tags = wandb.run.tags + ("RaceCondition",) - else: - wandb.run.tags = ("RaceCondition",) - - self.hparams, orion_race_condition = dist_utils.broadcast_from_master( - self.hparams, orion_race_condition - ) - if self.hparams: + hparams = sample_orion_hparams(orion_exp, trainer_config) + if hparams.get("orion_race_condition"): + logging.warning("\n\n ⛔️ Orion race condition. Stopping here.\n\n") + wrap_up(args, start_time, error, signal) + sys.exit() + + hparams = dist_utils.broadcast_from_master(hparams) + if hparams: print("\n💎 Received hyper-parameters from Orion:") - print(dump(self.hparams), end="\n") + print(dump(hparams), end="\n") + trainer_config = merge_dicts(trainer_config, hparams) + + # -- Setup trainer + + trainer_config = continue_orion_exp(trainer_config) + trainer_config = auto_note(trainer_config) + trainer_config = set_hidden_channels(trainer_config) - self.trainer_config = merge_dicts(self.trainer_config, self.hparams) - self.trainer_config = continue_orion_exp(self.trainer_config) - self.trainer_config = auto_note(self.trainer_config) - cls = registry.get_trainer_class(self.trainer_config["trainer"]) try: - self.trainer: BaseTrainer = cls(**self.trainer_config) + cls = registry.get_trainer_class(trainer_config["trainer"]) + trainer: BaseTrainer = cls(**trainer_config) except Exception as e: - print(f"Error in trainer initialization: {e}") traceback.print_exc() + logging.warning(f"\n💀 Error in trainer initialization: {e}\n") signal = "trainer_init_error" - start_time = time.time() if signal is None: - task = registry.get_task_class(self.trainer_config["mode"])( - self.trainer_config - ) - task.setup(self.trainer) + task = registry.get_task_class(trainer_config["mode"])(trainer_config) + task.setup(trainer) print_warnings() + # -- Start Training + signal = task.run() + # -- End of training + # handle job preemption / time limit if signal == "SIGTERM": print("\nJob was preempted. 
Wrapping up...\n") - if self.trainer: - self.trainer.close_datasets() + if trainer: + trainer.close_datasets() dist_utils.synchronize() - total_time = time.time() - start_time - logging.info(f"Total time taken: {total_time}") - if self.trainer and self.trainer.logger is not None: - self.trainer.logger.log({"Total time": total_time}) objective = dist_utils.broadcast_from_master( - self.trainer.objective if self.trainer else None + trainer.objective if trainer else None ) if orion_exp is not None: @@ -160,69 +188,9 @@ def run(self, orion_exp=None): [{"type": "objective", "name": "energy_mae", "value": objective}], ) - -if __name__ == "__main__": - runner = error = signal = None - - setup_logging() - - parser = flags.get_parser() - args, override_args = parser.parse_known_args() - args = update_from_sbatch_py_vars(args) - if args.logdir: - args.logdir = resolve(args.logdir) - - trainer_config = build_config(args, override_args) - trainer_config["optim"]["eval_batch_size"] = trainer_config["optim"]["batch_size"] - - original_trainer_config = copy.deepcopy(trainer_config) - - if args.distributed: - dist_utils.setup(trainer_config) - print("Distributed backend setup.") - - if dist_utils.is_master(): - trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) - # dist_utils.synchronize() - - # ------------------- - # ----- Setup ----- - # ------------------- - setup_imports() - print("All things imported.") - trainer_config = continue_from_slurm_job_id(trainer_config) - trainer_config = read_slurm_env(trainer_config) - runner = Runner(trainer_config) - print("Runner ready.") - - try: - # ------------------- - # ----- Train ----- - # ------------------- - if args.orion_exp_config_path and dist_utils.is_master(): - experiment = load_orion_exp(args) - print("\nStarting runner.") - runner.run(orion_exp=experiment) - else: - print("Starting runner.") - runner.run() - except Exception: error = True print(traceback.format_exc()) finally: - if args.distributed: - print( - "\nWaiting for all processes to finish with dist_utils.cleanup()...", - end="", - ) - dist_utils.cleanup() - print("Done!") - - if "interactive" not in os.popen(f"squeue -hj {JOB_ID}").read(): - print("\nSelf-canceling SLURM job in 32s", JOB_ID) - os.popen(f"sleep 32 && scancel {JOB_ID}") - - if runner and runner.trainer and runner.trainer.logger: - runner.trainer.logger.finish(error or signal) + wrap_up(args, start_time, error, signal, trainer=trainer) diff --git a/ocpmodels/common/logger.py b/ocpmodels/common/logger.py index b628704fcf..a42eb6eeeb 100644 --- a/ocpmodels/common/logger.py +++ b/ocpmodels/common/logger.py @@ -138,6 +138,7 @@ def __init__(self, trainer_config): if not CLUSTER.drac: self.collect_output_files(policy="live") self.collect_output_files(policy="end") + print(f"\n{'-'*80}\n") def watch(self, model): wandb.watch(model) diff --git a/ocpmodels/common/orion_utils.py b/ocpmodels/common/orion_utils.py new file mode 100644 index 0000000000..7f44cb5683 --- /dev/null +++ b/ocpmodels/common/orion_utils.py @@ -0,0 +1,227 @@ +import copy +import os +import time +from pathlib import Path +from shutil import copyfile, move + +import yaml +from orion.client import build_experiment +from orion.core.utils.exceptions import ReservationRaceCondition + +from ocpmodels.common.utils import ROOT, RUN_DIR, unflatten_dict + + +def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): + """ + Multiplies all values of orion_hparams listed in mult_factor_dict["targets"] + by mult_factor_dict["value"]. 
+ + eg: + >>> orion_hparams = { + "model/hidden_channels": 4, + "model/num_layers": 4, + "optim/batch_size": 4, + "optim/initial_lr": 0.001, + "frame_averaging": "", + } + + >>> mult_factor_dict = {"value": 32, "targets": "hidden_channels, batch_size"} + + >>> apply_mult_factor(orion_hparams, mult_factor_dict, sep="/") + { + "model/hidden_channels": 128, + "model/num_layers": 4, + "optim/batch_size": 128, + "optim/initial_lr": 0.001, + "frame_averaging": "" + } + + Args: + orion_hparams (_type_): _description_ + mult_factor_dict (_type_): _description_ + sep (str, optional): _description_. Defaults to ".". + + Returns: + _type_: _description_ + """ + if not mult_factor_dict: + return orion_hparams + if not isinstance(mult_factor_dict, dict): + print( + f">>> Warning: ignoring apply_mult_factor, not a dict: {mult_factor_dict}." + ) + if "value" not in mult_factor_dict or "targets" not in mult_factor_dict: + print( + ">>> Warning: ignoring apply_mult_factor, " + + " missing 'value' or 'targets' keys: {}.".format(mult_factor_dict) + ) + value, targets = mult_factor_dict["value"], mult_factor_dict["targets"] + targets = set([t.strip() for t in targets.split(",")]) + updated_hparams = copy.deepcopy(orion_hparams) + for k, v in orion_hparams.items(): + target = k.split(sep)[-1] + if target in targets: + updated_hparams[k] = v * value + return updated_hparams + + +def set_max_fidelity(hparams, orion_exp): + for p, prior in orion_exp.space.items(): + if prior.type == "fidelity": + keys = p.split("/") + if len(keys) == 1: + hparams[f"fidelity_{p}"] = prior.high + elif len(keys) == 2: + if keys[0] not in hparams: + hparams[keys[0]] = {} + hparams[keys[0]][f"fidelity_{keys[1]}"] = prior.high + else: + print("Error: fidelity parameters must be at most 2 levels deep.") + return hparams + + +def sample_orion_hparams(orion_exp, trainer_config): + hparams = {} + try: + orion_trial = orion_exp.suggest(1) + print( + "\n🚨 Orion reservation race condition detected. 
Exiting", + "and deleting run dir", + ) + hparams = set_max_fidelity( + unflatten_dict( + apply_mult_factor( + orion_trial.params, + trainer_config.get("orion_mult_factor"), + sep="/", + ), + sep="/", + ), + orion_exp, + ) + hparams["orion_hash_params"] = orion_trial.hash_params + hparams["orion_unique_exp_name"] = orion_exp.name + except ReservationRaceCondition: + hparams["orion_race_condition"] = True + import wandb + + if wandb.run is not None: + if wandb.run.tags: + wandb.run.tags = wandb.run.tags + ("RaceCondition",) + else: + wandb.run.tags = ("RaceCondition",) + return hparams + + +def get_and_move_orion_db_path(exp_name): + db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) + db_file = f"{db_id}_db.pkl" if not db_id.endswith("_db.pkl") else db_id + scratch_db = RUN_DIR.parent / "orion" / "storage" / db_file + scratch_db.parent.mkdir(parents=True, exist_ok=True) + if not scratch_db.exists(): + home_db = ROOT / f"data/orion/storage/{db_file}" + + if not home_db.exists(): + return scratch_db + + lock_file = home_db.parent / f"{db_file}.cp_lock" + if not lock_file.exists(): + lock_file.touch() + copyfile(home_db, scratch_db) + move(home_db, home_db.parent / f"{db_file}.bak") + os.symlink(str(scratch_db), str(home_db)) + print("Copied and symlinked db from home to scratch.") + lock_file.unlink() + + while lock_file.exists(): + print("Waiting for lock to be released...") + time.sleep(1) + + return scratch_db + + +def load_orion_exp(args): + exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) + + assert args.orion_unique_exp_name or exp_config.get( + "unique_exp_name" + ), "Must provide orion_unique_exp_name in the command-line or the config file." + + print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") + exp_name = args.orion_unique_exp_name or exp_config["unique_exp_name"] + db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) + db_path = get_and_move_orion_db_path(db_id) + experiment = build_experiment( + storage={ + "database": { + "host": str(db_path), + "type": "pickleddb", + } + }, + name=exp_name, + space=exp_config["space"], + algorithms=exp_config["algorithms"], + ) + return experiment + + +def continue_orion_exp(trainer_config): + if not trainer_config.get("orion_exp_config_path"): + return trainer_config + + if "orion_hash_params" not in trainer_config: + faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" + print( + "\n\nWARNING: trainer_config has 'orion_exp_config_path'", + "but no 'orion_hash_params'.", + "This can lead to inconsistencies.", + f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", + ) + faulty_path.write_text(yaml.dump(trainer_config)) + return trainer_config + + hash_params = trainer_config["orion_hash_params"] + exp_name = trainer_config["orion_unique_exp_name"] + id_file = f"{exp_name}--{hash_params}.unique" + (Path(trainer_config["run_dir"]) / id_file).touch() + base_dir = Path(trainer_config["run_dir"]).parent + existing_id_files = list(base_dir.glob(f"*/{id_file}")) + + latest_dirs = sorted( + [ + f.parent + for f in existing_id_files + if float(f.parent.name) != float(trainer_config["job_id"]) + ], + key=lambda f: float(f.name), + ) + + if not latest_dirs: + print("\n😅 No previous Orion trial matched for unique file: ", id_file) + return trainer_config + + resume_dir = latest_dirs[-1] + + resume_ckpts = sorted( + [f for f in (resume_dir / "checkpoints").glob("checkpoint-*")], + key=lambda f: float(f.stem.split("-")[-1]), + ) + + if not 
resume_ckpts: + print(f"🥶 Warning: No checkpoint found in {str(resume_dir)}. Not resuming.") + return trainer_config + + trainer_config["checkpoint"] = str(resume_ckpts[-1]) + resume_url = (resume_dir / "wandb_url.txt").read_text().strip() + trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] + + print( + f"\n🎁 Found {len(resume_ckpts)} existing Orion runs.", + "Resuming from latest:", + str(resume_dir), + "\nOn wandb run:", + resume_url, + ) + print("Based on unique file id:", id_file) + print("Continuing from checkpoint:", trainer_config["checkpoint"], end="\n\n") + return trainer_config diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 2d79d56f06..75ca062e74 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -24,7 +24,6 @@ from functools import wraps from itertools import product from pathlib import Path -from shutil import copyfile, move import numpy as np import torch @@ -32,7 +31,6 @@ import yaml from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas from matplotlib.figure import Figure -from orion.client import build_experiment from torch_geometric.data import Data from torch_geometric.utils import remove_self_loops from torch_scatter import segment_coo, segment_csr @@ -70,189 +68,6 @@ def __getattr__(self, k: str): RUN_DIR = Path(os.environ["SCRATCH"]) / "ocp" / "runs" -def set_max_fidelity(hparams, orion_exp): - for p, prior in orion_exp.space.items(): - if prior.type == "fidelity": - keys = p.split("/") - if len(keys) == 1: - hparams[f"fidelity_{p}"] = prior.high - elif len(keys) == 2: - if keys[0] not in hparams: - hparams[keys[0]] = {} - hparams[keys[0]][f"fidelity_{keys[1]}"] = prior.high - else: - print("Error: fidelity parameters must be at most 2 levels deep.") - return hparams - - -def apply_mult_factor(orion_hparams, mult_factor_dict, sep="."): - """ - Multiplies all values of orion_hparams listed in mult_factor_dict["targets"] - by mult_factor_dict["value"]. - - eg: - >>> orion_hparams = { - "model/hidden_channels": 4, - "model/num_layers": 4, - "optim/batch_size": 4, - "optim/initial_lr": 0.001, - "frame_averaging": "", - } - - >>> mult_factor_dict = {"value": 32, "targets": "hidden_channels, batch_size"} - - >>> apply_mult_factor(orion_hparams, mult_factor_dict, sep="/") - { - "model/hidden_channels": 128, - "model/num_layers": 4, - "optim/batch_size": 128, - "optim/initial_lr": 0.001, - "frame_averaging": "" - } - - Args: - orion_hparams (_type_): _description_ - mult_factor_dict (_type_): _description_ - sep (str, optional): _description_. Defaults to ".". - - Returns: - _type_: _description_ - """ - if not mult_factor_dict: - return orion_hparams - if not isinstance(mult_factor_dict, dict): - print( - f">>> Warning: ignoring apply_mult_factor, not a dict: {mult_factor_dict}." 
- ) - if "value" not in mult_factor_dict or "targets" not in mult_factor_dict: - print( - ">>> Warning: ignoring apply_mult_factor, " - + " missing 'value' or 'targets' keys: {}.".format(mult_factor_dict) - ) - value, targets = mult_factor_dict["value"], mult_factor_dict["targets"] - targets = set([t.strip() for t in targets.split(",")]) - updated_hparams = copy.deepcopy(orion_hparams) - for k, v in orion_hparams.items(): - target = k.split(sep)[-1] - if target in targets: - updated_hparams[k] = v * value - return updated_hparams - - -def get_and_move_orion_db_path(exp_name): - db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) - db_file = f"{db_id}_db.pkl" if not db_id.endswith("_db.pkl") else db_id - scratch_db = RUN_DIR.parent / "orion" / "storage" / db_file - scratch_db.parent.mkdir(parents=True, exist_ok=True) - if not scratch_db.exists(): - home_db = ROOT / f"data/orion/storage/{db_file}" - - if not home_db.exists(): - return scratch_db - - lock_file = home_db.parent / f"{db_file}.cp_lock" - if not lock_file.exists(): - lock_file.touch() - copyfile(home_db, scratch_db) - move(home_db, home_db.parent / f"{db_file}.bak") - os.symlink(str(scratch_db), str(home_db)) - print("Copied and symlinked db from home to scratch.") - lock_file.unlink() - - while lock_file.exists(): - print("Waiting for lock to be released...") - time.sleep(1) - - return scratch_db - - -def load_orion_exp(args): - exp_config = yaml.safe_load(Path(args.orion_exp_config_path).read_text()) - - assert args.orion_unique_exp_name or exp_config.get( - "unique_exp_name" - ), "Must provide orion_unique_exp_name in the command-line or the config file." - - print(f"🔎 Orion Experiment Config:\n{yaml.dump(exp_config)}") - exp_name = args.orion_unique_exp_name or exp_config["unique_exp_name"] - db_id = "".join([c for c in exp_name if c.isalnum() or c in "_-."]) - db_path = get_and_move_orion_db_path(db_id) - experiment = build_experiment( - storage={ - "database": { - "host": str(db_path), - "type": "pickleddb", - } - }, - name=exp_name, - space=exp_config["space"], - algorithms=exp_config["algorithms"], - ) - return experiment - - -def continue_orion_exp(trainer_config): - if not trainer_config.get("orion_exp_config_path"): - return trainer_config - - if "orion_hash_params" not in trainer_config: - faulty_path = Path(trainer_config["run_dir"]) / "faulty_trainer_config.yaml" - print( - "\n\nWARNING: trainer_config has 'orion_exp_config_path'", - "but no 'orion_hash_params'.", - "This can lead to inconsistencies.", - f"You should investigate the faulty config in:\n{str(faulty_path)}\n\n", - ) - faulty_path.write_text(yaml.dump(trainer_config)) - return trainer_config - - hash_params = trainer_config["orion_hash_params"] - exp_name = trainer_config["orion_unique_exp_name"] - id_file = f"{exp_name}--{hash_params}.unique" - (Path(trainer_config["run_dir"]) / id_file).touch() - base_dir = Path(trainer_config["run_dir"]).parent - existing_id_files = list(base_dir.glob(f"*/{id_file}")) - - latest_dirs = sorted( - [ - f.parent - for f in existing_id_files - if float(f.parent.name) != float(trainer_config["job_id"]) - ], - key=lambda f: float(f.name), - ) - - if not latest_dirs: - print("\n😅 No previous Orion trial matched for unique file: ", id_file) - return trainer_config - - resume_dir = latest_dirs[-1] - - resume_ckpts = sorted( - [f for f in (resume_dir / "checkpoints").glob("checkpoint-*")], - key=lambda f: float(f.stem.split("-")[-1]), - ) - - if not resume_ckpts: - print(f"🥶 Warning: No checkpoint found in 
{str(resume_dir)}. Not resuming.") - return trainer_config - - trainer_config["checkpoint"] = str(resume_ckpts[-1]) - resume_url = (resume_dir / "wandb_url.txt").read_text().strip() - trainer_config["wandb_resume_id"] = resume_url.split("/runs/")[-1] - - print( - f"\n🎁 Found {len(resume_ckpts)} existing Orion runs.", - "Resuming from latest:", - str(resume_dir), - "\nOn wandb run:", - resume_url, - ) - print("Based on unique file id:", id_file) - print("Continuing from checkpoint:", trainer_config["checkpoint"], end="\n\n") - return trainer_config - - def read_slurm_env(config): """ Parses the output of `scontrol show` in order to store the slurm @@ -1089,6 +904,29 @@ def check_regress_forces(config): ) +def set_hidden_channels(config): + # Embedding( + # 85, + # hidden_channels + # - tag_hidden_channels + # - phys_hidden_channels + # - 2 * pg_hidden_channels, + # ) + hc = config["model"].get("hidden_channels", 0) + thc = config["model"].get("tag_hidden_channels", 0) + phc = config["model"].get("phys_hidden_channels", 0) * int( + config["model"].get("phys_embeds", 0) + ) + pghc = config["model"].get("pg_hidden_channels", 0) + + if hc - thc - phc - 2 * pghc < 0: + hc = thc + phc + 2 * pghc + 32 + print(f"WARNING: hidden_channels is too small. Setting it to {hc}") + config["model"]["hidden_channels"] = hc + + return config + + def load_config(config_str): model, task, split = config_str.split("-") conf_path = ROOT / "configs" / "models" @@ -1194,6 +1032,9 @@ def build_config(args, args_override): config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) config = override_drac_paths(config) + config = continue_from_slurm_job_id(config) + config = read_slurm_env(config) + config["optim"]["eval_batch_size"] = config["optim"]["batch_size"] return config diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index bbc647ee65..9963a299cb 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -145,7 +145,7 @@ def __init__(self, **kwargs): self.hpo_checkpoint_every = self.config["optim"].get("checkpoint_every", -1) if dist_utils.is_master() and not self.silent: - print("🧰 Trainer config:") + print(f"\n🧰 Trainer config:\n{'-'*17}\n") print(yaml.dump(self.config), end="\n\n") self.load() diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 74c8217681..bdb5a8908f 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -47,8 +47,6 @@ def now(self): return str(datetime.datetime.now()).split(".")[0] def load_task(self): - if not self.silent: - print(f"Loading dataset: {self.config['task']['dataset']}") self.num_targets = 1 # start imports from @@ -220,7 +218,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): model_run_time = 0 if not self.silent: - print(f"\n--- 🔄 Beginning of Training @ {self.now}---\n") + print(f"\n--- 🔄 Beginning of Training @ {self.now} ---\n") print(f"\nLogging train metrics every {log_train_every} steps") print(f"Printing train metrics every {self.config['print_every']} steps") print(f"\nEvaluating every {eval_every} steps\n") From 7aa4fb6a53fc364c262e9f7273ec23fba07cd276 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 17:28:41 -0500 Subject: [PATCH 193/273] fix prints --- main.py | 4 ++-- ocpmodels/trainers/base_trainer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/main.py b/main.py index 56993e2624..467b001694 100644 --- a/main.py +++ b/main.py @@ -46,12 
+46,12 @@ def print_warnings(): "`tag_specific_weights` is not handled for " + "`regress_forces: direct_with_gradient_target` in compute_loss()", ] - print("\n" + "-" * 80) + print("\n" + "-" * 80 + "\n") print("🛑 OCP-DR-Lab Warnings (nota benes):") for warning in warnings: print(f" • {warning}") print("Remove warnings when they are fixed in the code/configs.") - print("-" * 80 + "\n") + print("\n" + "-" * 80 + "\n") def wrap_up(args, start_time, trainer=None, error=None, signal=None): diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 9963a299cb..10c252ed08 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -145,7 +145,7 @@ def __init__(self, **kwargs): self.hpo_checkpoint_every = self.config["optim"].get("checkpoint_every", -1) if dist_utils.is_master() and not self.silent: - print(f"\n🧰 Trainer config:\n{'-'*17}\n") + print(f"\n🧰 Trainer config:\n{'-'*18}\n") print(yaml.dump(self.config), end="\n\n") self.load() From 14ea1f3858291c92cb8ccdaf014bf9d5c4976815 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 17:34:18 -0500 Subject: [PATCH 194/273] add `keep_orion_config` flag --- main.py | 2 ++ ocpmodels/common/flags.py | 6 ++++++ ocpmodels/common/utils.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/main.py b/main.py index 467b001694..4f2088f6e4 100644 --- a/main.py +++ b/main.py @@ -90,6 +90,8 @@ def wrap_up(args, start_time, trainer=None, error=None, signal=None): if args.logdir: args.logdir = resolve(args.logdir) + # -- Build config + trainer_config = build_config(args, override_args) original_trainer_config = copy.deepcopy(trainer_config) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 2a8caa3e91..6b35f7711a 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -95,6 +95,12 @@ def add_core_args(self): help="Run to restart, loading its config and overwriting " + "from the command-line", ) + self.parser.add_argument( + "--keep_orion_config", + type=bool, + help="If not True, any key in the continued/restarted config that contains" + + " ``orion`` will be set to ``None``", + ) self.parser.add_argument( "--timestamp-id", default=None, diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 75ca062e74..1d813441a5 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -990,6 +990,10 @@ def build_config(args, args_override): if args.continue_from_dir: continue_config["checkpoint"] = str(latest_ckpt) continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] + if not args.keep_orion_config: + for k in continue_config: + if "orion" in k: + continue_config[k] = None print( f"✅ Loading config from directory {str(cont_dir)}" + ( From 676929b449aa3abf0d61b3dd8f999bbf18da1214 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 17:42:47 -0500 Subject: [PATCH 195/273] fix normalizers for hof --- configs/exps/qm7x/schnet-fanet-lse.yaml | 9 --------- ocpmodels/common/utils.py | 2 +- ocpmodels/modules/normalizer.py | 20 +++++++++++--------- ocpmodels/trainers/base_trainer.py | 4 ++-- 4 files changed, 14 insertions(+), 21 deletions(-) diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml index 97097a6c16..b101084196 100644 --- a/configs/exps/qm7x/schnet-fanet-lse.yaml +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -63,22 +63,13 @@ default: runs: - config: schnet-qm7x-all - dataset: - train: - normalize_labels: True - config: schnet-qm7x-all 
- dataset: - train: - normalize_labels: True optim: lr_initial: 0.001 batch_size: 100 - config: schnet-qm7x-all - dataset: - train: - normalize_labels: True optim: lr_initial: 0.001 batch_size: 256 diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 1d813441a5..f3ea8c23ad 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -380,7 +380,7 @@ def set_qm7x_target_stats(trainer_config): if "train" in trainer_config["dataset"] and trainer_config["dataset"]["train"].get( "rescale_with_hof" ): - trainer_config["dataset"]["train"]["hof_rescales"] = hof_stats + trainer_config["dataset"]["train"]["hof_stats"] = hof_stats return trainer_config diff --git a/ocpmodels/modules/normalizer.py b/ocpmodels/modules/normalizer.py index df2830e276..f70c6dcf17 100644 --- a/ocpmodels/modules/normalizer.py +++ b/ocpmodels/modules/normalizer.py @@ -32,6 +32,7 @@ def __init__(self, tensor=None, mean=None, std=None, device=None): self.hof_mean = None self.hof_std = None + self.rescale_with_hof = False def to(self, device): self.mean = self.mean.to(device) @@ -43,19 +44,19 @@ def to(self, device): self.device = device def norm(self, tensor, hofs=None): - if hofs is not None: + if hofs is not None and self.rescale_with_hof: return tensor / hofs - self.hof_mean return (tensor - self.mean) / self.std def denorm(self, normed_tensor, hofs=None): - if hofs is not None: + if hofs is not None and self.rescale_with_hof: return (normed_tensor + self.hof_mean) * hofs return normed_tensor * self.std + self.mean def state_dict(self): sd = {"mean": self.mean, "std": self.std} - if self.hof_mean is not None: - sd["hof_rescales"] = { + if self.rescale_with_hof: + sd["hof_stats"] = { "mean": self.hof_mean, "std": self.hof_std, } @@ -64,9 +65,10 @@ def state_dict(self): def load_state_dict(self, state_dict): self.mean = state_dict["mean"].to(self.mean.device) self.std = state_dict["std"].to(self.mean.device) - if "hof_rescales" in state_dict: - self.set_hof_rescales(state_dict["hof_rescales"]) + if "hof_stats" in state_dict: + self.set_hof_rescales(state_dict["hof_stats"]) - def set_hof_rescales(self, hof_rescales): - self.hof_mean = torch.tensor(hof_rescales["mean"], device=self.device) - self.hof_std = torch.tensor(hof_rescales["std"], device=self.device) + def set_hof_rescales(self, hof_stats): + self.hof_mean = torch.tensor(hof_stats["mean"], device=self.device) + self.hof_std = torch.tensor(hof_stats["std"], device=self.device) + self.rescale_with_hof = True diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 10c252ed08..901908345a 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -319,9 +319,9 @@ def load_datasets(self): std=self.normalizer["target_std"], device=self.device, ) - if "hof_rescales" in self.normalizer: + if "hof_stats" in self.normalizer: self.normalizers["target"].set_hof_rescales( - self.normalizer["hof_rescales"] + self.normalizer["hof_stats"] ) else: self.normalizers["target"] = Normalizer( From 6798ecded51f62e5fe3b1890cfca518afa378c2e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:32:12 -0500 Subject: [PATCH 196/273] + set_hidden_channels --- main.py | 4 ++-- ocpmodels/common/utils.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 4f2088f6e4..1290ca79f4 100644 --- a/main.py +++ b/main.py @@ -54,7 +54,7 @@ def print_warnings(): print("\n" + "-" * 80 + "\n") -def wrap_up(args, start_time, trainer=None, error=None, 
signal=None): +def wrap_up(args, start_time, error=None, signal=None, trainer=None): total_time = time.time() - start_time logging.info(f"Total time taken: {total_time}") @@ -78,7 +78,7 @@ def wrap_up(args, start_time, trainer=None, error=None, signal=None): if __name__ == "__main__": - error = signal = orion_exp = orion_trial = None + error = signal = orion_exp = orion_trial = trainer = None orion_race_condition = False hparams = {} diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index f3ea8c23ad..b344c639ff 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -914,9 +914,8 @@ def set_hidden_channels(config): # ) hc = config["model"].get("hidden_channels", 0) thc = config["model"].get("tag_hidden_channels", 0) - phc = config["model"].get("phys_hidden_channels", 0) * int( - config["model"].get("phys_embeds", 0) - ) + phc = config["model"].get("phys_hidden_channels", 0) or 14 + phc *= int(config["model"].get("phys_embeds", 0)) pghc = config["model"].get("pg_hidden_channels", 0) if hc - thc - phc - 2 * pghc < 0: From 0f276bad75f22bdbec5f22d718689af62ebf786e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:32:22 -0500 Subject: [PATCH 197/273] qm9 orion v5 --- configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml new file mode 100644 index 0000000000..b52a2003aa --- /dev/null +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v5.yaml @@ -0,0 +1,73 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 8GB + cpus: 4 + gres: gpu:1 + time: 02:50:00 + partition: long + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion + log_train_every: 200 + optim: + batch_size: 32 + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + loss_energy: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 800 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: num_filters, pg_hidden_channels, num_gaussians + frame_averaging: 3D + fa_frames: random + model: + edge_embed_type: all_rij + energy_head: "" + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 20 + + unique_exp_name: fanet-qm9-v5.0.0 + + space: + optim/max_epochs: fidelity(200, 2000, base=5) + optim/lr_initial: loguniform(1e-4, 6e-4, precision=3) + model/graph_norm: choices([True, False]) + model/skip_co: choices([True, False]) + model/second_layer_mlp: choices([True, False]) + model/hidden_channels: uniform(6, 15, discrete=True) + model/mp_type: choices(["simple", "base", "updownscale", "updownscale_base"]) + model/num_filters: uniform(4, 16, discrete=True) + model/num_gaussians: uniform(1, 4, discrete=True) + model/num_interactions: uniform(3, 5, discrete=True) + model/pg_hidden_channels: uniform(0, 1, discrete=True) + model/phys_embeds: choices([True, 
False]) + model/max_num_neighbours: choices([30, 40, 50]) + model/cutoff: uniform(4, 6, precision=1) + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 From b83da002f17d3bd21eae6098063665019f671a8d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:34:59 -0500 Subject: [PATCH 198/273] add = in sbatch rundir and logdir --- sbatch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sbatch.py b/sbatch.py index 0d6affdfbe..bf17bd5c6f 100644 --- a/sbatch.py +++ b/sbatch.py @@ -260,10 +260,10 @@ def write_orion_config(args, outdir): # add logdir to main.py's command-line arguments if "--logdir" not in args.py_args and args.logdir: - args.py_args += f" --logdir {args.logdir}" + args.py_args += f" --logdir={args.logdir}" # add run-dir to main.py's command-line arguments if "--run-dir" not in args.py_args and args.logdir: - args.py_args += f" --run-dir {args.logdir}" + args.py_args += f" --run-dir={args.logdir}" if "--note" not in args.py_args and args.note: note = args.note.replace('"', '\\"') From 4b7b9a2a21e508579b2cbba1c1f59adbc8e84a64 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:43:33 -0500 Subject: [PATCH 199/273] add job_name from exp_name for slurm --- launch_exp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/launch_exp.py b/launch_exp.py index 8ab6d86867..665cbba6bd 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -197,6 +197,7 @@ def get_args_or_exp(key, args, exp): runs = [ { "orion_exp_config_path": str(search_path), + "job_name": unique_exp_name, } for _ in range(n_jobs) ] From 8e8b3729a88378f498db2e9ef66bb6bf74a0ffe4 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 18:45:39 -0500 Subject: [PATCH 200/273] auto job_name from exp_name if no orion_unique_exp_name --- launch_exp.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index 665cbba6bd..dd7d4110ad 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -197,7 +197,9 @@ def get_args_or_exp(key, args, exp): runs = [ { "orion_exp_config_path": str(search_path), - "job_name": unique_exp_name, + "job": { + "job_name": unique_exp_name, + }, } for _ in range(n_jobs) ] @@ -215,6 +217,9 @@ def get_args_or_exp(key, args, exp): if "time" in job: job["time"] = seconds_to_time_str(job["time"]) + if "job_name" not in job: + job["job_name"] = exp_name + if "wandb_tags" in params: params["wandb_tags"] += "," + exp_name else: From 55b738a4be85c697351665ef99565142de1b7ee6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 20:20:50 -0500 Subject: [PATCH 201/273] shift None observation print --- main.py | 6 +++--- ocpmodels/common/orion_utils.py | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/main.py b/main.py index 1290ca79f4..c7c5f30d45 100644 --- a/main.py +++ b/main.py @@ -122,7 +122,7 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): if dist_utils.is_master(): if orion_exp: - hparams = sample_orion_hparams(orion_exp, trainer_config) + hparams, orion_trial = sample_orion_hparams(orion_exp, trainer_config) if hparams.get("orion_race_condition"): logging.warning("\n\n ⛔️ Orion race condition. Stopping here.\n\n") wrap_up(args, start_time, error, signal) @@ -182,13 +182,13 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): "Received trainer_init_error from worker.", "Setting objective to 1e12.", ) - else: - print("Received None objective from worker. 
Skipping observation.") if objective is not None: orion_exp.observe( orion_trial, [{"type": "objective", "name": "energy_mae", "value": objective}], ) + else: + print("Received None objective from worker. Skipping observation.") except Exception: error = True diff --git a/ocpmodels/common/orion_utils.py b/ocpmodels/common/orion_utils.py index 7f44cb5683..24a2f0a52e 100644 --- a/ocpmodels/common/orion_utils.py +++ b/ocpmodels/common/orion_utils.py @@ -82,6 +82,7 @@ def set_max_fidelity(hparams, orion_exp): def sample_orion_hparams(orion_exp, trainer_config): hparams = {} + orion_trial = None try: orion_trial = orion_exp.suggest(1) print( @@ -110,7 +111,7 @@ def sample_orion_hparams(orion_exp, trainer_config): wandb.run.tags = wandb.run.tags + ("RaceCondition",) else: wandb.run.tags = ("RaceCondition",) - return hparams + return hparams, orion_trial def get_and_move_orion_db_path(exp_name): From 2f64dbdbc458cd45f507b51346fcebbd0bef9bc7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 20:43:08 -0500 Subject: [PATCH 202/273] override max_epochs or steps from cli --- main.py | 17 +++++++---------- ocpmodels/common/flags.py | 2 +- ocpmodels/common/utils.py | 38 ++++++++++++++++++++++++++++++++------ 3 files changed, 40 insertions(+), 17 deletions(-) diff --git a/main.py b/main.py index c7c5f30d45..01fc6c3c6d 100644 --- a/main.py +++ b/main.py @@ -27,7 +27,7 @@ setup_imports, setup_logging, update_from_sbatch_py_vars, - set_hidden_channels, + set_min_hidden_channels, ) from ocpmodels.common.orion_utils import ( continue_orion_exp, @@ -93,7 +93,6 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): # -- Build config trainer_config = build_config(args, override_args) - original_trainer_config = copy.deepcopy(trainer_config) if args.distributed: dist_utils.setup(trainer_config) @@ -119,14 +118,12 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): if args.orion_exp_config_path and dist_utils.is_master(): orion_exp = load_orion_exp(args) + hparams, orion_trial = sample_orion_hparams(orion_exp, trainer_config) - if dist_utils.is_master(): - if orion_exp: - hparams, orion_trial = sample_orion_hparams(orion_exp, trainer_config) - if hparams.get("orion_race_condition"): - logging.warning("\n\n ⛔️ Orion race condition. Stopping here.\n\n") - wrap_up(args, start_time, error, signal) - sys.exit() + if hparams.get("orion_race_condition"): + logging.warning("\n\n ⛔️ Orion race condition. 
Stopping here.\n\n") + wrap_up(args, start_time, error, signal) + sys.exit() hparams = dist_utils.broadcast_from_master(hparams) if hparams: @@ -138,7 +135,7 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): trainer_config = continue_orion_exp(trainer_config) trainer_config = auto_note(trainer_config) - trainer_config = set_hidden_channels(trainer_config) + trainer_config = set_min_hidden_channels(trainer_config) try: cls = registry.get_trainer_class(trainer_config["trainer"]) diff --git a/ocpmodels/common/flags.py b/ocpmodels/common/flags.py index 6b35f7711a..1dd7e83701 100644 --- a/ocpmodels/common/flags.py +++ b/ocpmodels/common/flags.py @@ -99,7 +99,7 @@ def add_core_args(self): "--keep_orion_config", type=bool, help="If not True, any key in the continued/restarted config that contains" - + " ``orion`` will be set to ``None``", + + " ``orion`` or ``fidelity`` will be set to ``None``", ) self.parser.add_argument( "--timestamp-id", diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index b344c639ff..8b3ca97fa5 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -904,7 +904,7 @@ def check_regress_forces(config): ) -def set_hidden_channels(config): +def set_min_hidden_channels(config): # Embedding( # 85, # hidden_channels @@ -990,9 +990,15 @@ def build_config(args, args_override): continue_config["checkpoint"] = str(latest_ckpt) continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if not args.keep_orion_config: + dels = {} for k in continue_config: - if "orion" in k: + if "orion" in k or "fidelity" in k: + dels[k] = copy.deepcopy(continue_config[k]) continue_config[k] = None + print( + "Removing orion config from continue config. Set to None:", + "{" + ", ".join([f"{k}: {v}->None" for k, v in dels.items()]) + "}", + ) print( f"✅ Loading config from directory {str(cont_dir)}" + ( @@ -1021,14 +1027,34 @@ def build_config(args, args_override): if continue_config: new_dirs = [(k, v) for k, v in config.items() if "dir" in k] - # dataset_config = copy.deepcopy(config["dataset"]) config = merge_dicts( continue_config, {k: resolve(v) if isinstance(v, str) else v for k, v in new_dirs}, ) - # config["dataset"] = dataset_config - config = merge_dicts(config, cli_args_dict()) - config = merge_dicts(config, overrides) + cli = cli_args_dict() + if "max_steps" in cli.get("optim", {}): + if "max_epochs" in cli.get("optim", {}): + print( + "Cannot set both `max_steps` and `max_epochs` from CLI.", + " Using `max_steps`.", + ) + del cli["optim"]["max_epochs"] + if "max_epochs" in config["optim"]: + print( + f"Deleting max_epochs ({config['optim']['max_epochs']})", + " because of `max_steps` from CLI.", + "It will be reset by the Trainer.", + ) + del config["optim"]["max_epochs"] + elif "max_epochs" in cli.get("optim", {}): + if "max_steps" in config["optim"]: + print( + f"Deleting max_steps ({config['optim']['max_steps']})", + " because of `max_epochs` from CLI.", + "It will be reset by the Trainer.", + ) + del config["optim"]["max_steps"] + config = merge_dicts(config, cli) check_regress_forces(config) config = set_cpus_to_workers(config) From 15e1d5f1f1d84765ff1b9d0452a45795b303bd65 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Thu, 19 Jan 2023 20:49:20 -0500 Subject: [PATCH 203/273] add no confirm flag --- launch_exp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launch_exp.py b/launch_exp.py index dd7d4110ad..7fa87240b8 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -243,7 +243,7 @@ def 
get_args_or_exp(key, args, exp): text += "\n<><><> Experiment config:\n\n-----" + exp_file.read_text() + "-----" text += "\n<><><> Experiment runs:\n\n • " + "\n\n • ".join(commands) + separator - confirm = input("\n🚦 Confirm? [y/n] : ") + confirm = args.no_confirm or input("\n🚦 Confirm? [y/n] : ") if confirm == "y": try: From 08d5e5e61b419945173ae09ca2b3854266f6dcef Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 10:21:06 -0500 Subject: [PATCH 204/273] fix qm coefs --- configs/exps/qm7x/schnet-fanet-lse.yaml | 55 +++++++++++++++++++------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml index b101084196..8b6000ea9b 100644 --- a/configs/exps/qm7x/schnet-fanet-lse.yaml +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -29,12 +29,12 @@ default: max_epochs: 100 warmup_steps: 3000 lr_initial: 0.0001 - eval_every: 0.34 + eval_every: 0.251 + energy_coefficient: 0.01 + energy_grad_coefficient: 0 + force_coefficient: 0.99 # parameters EMA ema_decay: 0.999 - energy_coefficient: 0. - energy_grad_coefficient: 0 - force_coefficient: 1. loss_energy: mae loss_force: mse # all below is for the scheduler @@ -66,14 +66,8 @@ runs: - config: schnet-qm7x-all optim: - lr_initial: 0.001 batch_size: 100 - - config: schnet-qm7x-all - optim: - lr_initial: 0.001 - batch_size: 256 - - config: fanet-qm7x-all model: graph_norm: true @@ -82,9 +76,13 @@ runs: - config: fanet-qm7x-all optim: - lr_initial: 0.001 batch_size: 100 + energy_coefficient: 0.01 + energy_grad_coefficient: 0.1 + force_coefficient: 0.89 model: + hidden_channels: 256 + num_filters: 256 graph_norm: true edge_embed_type: all_rij mp_type: updownscale_base @@ -93,8 +91,40 @@ runs: - config: fanet-qm7x-all optim: - lr_initial: 0.001 batch_size: 100 + energy_coefficient: 0.01 + energy_grad_coefficient: 0 + force_coefficient: 0.99 + model: + graph_norm: false + force_decoder_type: mlp + edge_embed_type: all_rij + regress_forces: direct + mp_type: updownscale_base + num_interactions: 4 + regress_forces: direct + + - config: fanet-qm7x-all + optim: + batch_size: 100 + energy_coefficient: 0.01 + energy_grad_coefficient: 0 + force_coefficient: 0.99 + model: + hidden_channels: 256 + num_filters: 256 + graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: from_energy + + - config: fanet-qm7x-all + optim: + batch_size: 100 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 0 model: graph_norm: false force_decoder_type: mlp @@ -102,3 +132,4 @@ runs: regress_forces: direct mp_type: updownscale_base num_interactions: 4 + regress_forces: "" From 1733dc3860545cd7fc9a5937121f48a27d63a81f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:26:27 -0500 Subject: [PATCH 205/273] qm9 lse --- .../qm9-metadata/lse-shifts-pre-attr.json | 230 ++++++++++++++++++ configs/models/tasks/qm9.yaml | 3 + ocpmodels/datasets/qm9.py | 23 +- scripts/compute_qm9_lse.py | 44 ++++ 4 files changed, 299 insertions(+), 1 deletion(-) create mode 100644 configs/models/qm9-metadata/lse-shifts-pre-attr.json create mode 100644 scripts/compute_qm9_lse.py diff --git a/configs/models/qm9-metadata/lse-shifts-pre-attr.json b/configs/models/qm9-metadata/lse-shifts-pre-attr.json new file mode 100644 index 0000000000..86f4af4829 --- /dev/null +++ b/configs/models/qm9-metadata/lse-shifts-pre-attr.json @@ -0,0 +1,230 @@ +[ + [ + 0.0, + -0.10982761652979106, + 0.0, + 0.0, + 0.0, + 0.0, + 
0.32071860969393706, + 0.7727475754212988, + 0.6122788803796335, + 0.12181916029627653 + ], + [ + 0.0, + 1.0966529080621437, + 0.0, + 0.0, + 0.0, + 0.0, + 8.345193465915363, + 6.7224998615447875, + 3.732818145439648, + 2.2936289582074605 + ], + [ + 0.0, + 0.0077456863520091205, + 0.0, + 0.0, + 0.0, + 0.0, + -0.7250601459571974, + -0.6989512318127282, + -0.8863108915238633, + -0.9922779353470085 + ], + [ + 0.0, + 0.30021842422367884, + 0.0, + 0.0, + 0.0, + 0.0, + -0.26317262369533545, + -0.2875368455127921, + -0.3500397347350114, + 0.04404252090948248 + ], + [ + 0.0, + 0.2924707296899182, + 0.0, + 0.0, + 0.0, + 0.0, + 0.46189245571758497, + 0.4114099985995442, + 0.5362623401609063, + 1.036313993269014 + ], + [ + 0.0, + 47.45816316610576, + 0.0, + 0.0, + 0.0, + 0.0, + 70.0750144837597, + 124.4791335864648, + 127.86854197433897, + 138.72469086321337 + ], + [ + 0.0, + 0.3121385900756226, + 0.0, + 0.0, + 0.0, + 0.0, + 0.1369104053751863, + 0.1395331622284403, + 0.11435811860274497, + 0.0927858357374155 + ], + [ + 0.0, + -16.42979788627529, + 0.0, + 0.0, + 0.0, + 0.0, + -1036.0476484603748, + -1489.8018410118, + -2046.9839395287415, + -2717.500731519501 + ], + [ + 0.0, + -16.41971726970345, + 0.0, + 0.0, + 0.0, + 0.0, + -1036.0357739433853, + -1489.7785840232264, + -2046.957058538375, + -2717.4729766865266 + ], + [ + 0.0, + -16.41966403449024, + 0.0, + 0.0, + 0.0, + 0.0, + -1036.032953979038, + -1489.7756271261542, + -2046.9541183747576, + -2717.4699865136436 + ], + [ + 0.0, + -16.442885376380016, + 0.0, + 0.0, + 0.0, + 0.0, + -1036.1303574932683, + -1489.905244199318, + -2047.0931070097215, + -2717.611776049197 + ], + [ + 0.0, + 1.2395997052342684, + 0.0, + 0.0, + 0.0, + 0.0, + 2.0381439844712617, + 2.786144517849412, + 3.081733026508292, + 3.3493991981514926 + ], + [ + 0.0, + -2.816675503921841, + 0.0, + 0.0, + 0.0, + 0.0, + -6.184539526467787, + -4.499344570456945, + -4.372725898402976, + -4.015912118288816 + ], + [ + 0.0, + -2.8451268608519213, + 0.0, + 0.0, + 0.0, + 0.0, + -6.211222454315244, + -4.514619577859387, + -4.384377408978144, + -4.026689035403237 + ], + [ + 0.0, + -2.8707881825750228, + 0.0, + 0.0, + 0.0, + 0.0, + -6.234090510252649, + -4.537349782429693, + -4.40712339763125, + -4.049381712818243 + ], + [ + 0.0, + -2.539854050015558, + 0.0, + 0.0, + 0.0, + 0.0, + -5.871457077318331, + -4.193597910926504, + -4.075001860101352, + -3.723844087894856 + ], + [ + 0.0, + -12.07141210715022, + 0.0, + 0.0, + 0.0, + 0.0, + 21.450944711624825, + -6.003425192133964, + -7.458598998696279, + -12.120394582901548 + ], + [ + 0.0, + -0.03777062245051535, + 0.0, + 0.0, + 0.0, + 0.0, + 0.21317817854549387, + 0.1600228078641471, + 0.1495886265915201, + 0.1495129174010618 + ], + [ + 0.0, + -0.023858095381470663, + 0.0, + 0.0, + 0.0, + 0.0, + 0.16957282642751406, + 0.09736248987285331, + 0.10686749266146903, + 0.08566440464961594 + ] +] \ No newline at end of file diff --git a/configs/models/tasks/qm9.yaml b/configs/models/tasks/qm9.yaml index e53c071188..42b4256a1e 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -30,6 +30,7 @@ default: target: 12 # predict atomization energy at 0K at index 12 seed: 123 normalize_labels: True # mean and std of target will be set by qm9.py if this is True + lse_shift: false indices: start: 0 end: 110000 @@ -37,6 +38,7 @@ default: src: /network/projects/ocp/qm9 target: 12 # predict atomization energy at 0K at index 12 seed: 123 + lse_shift: false indices: start: 110000 end: 120000 @@ -44,6 +46,7 @@ default: src: /network/projects/ocp/qm9 
target: 12 # predict atomization energy at 0K at index 12 seed: 123 + lse_shift: false indices: start: 120000 end: -1 diff --git a/ocpmodels/datasets/qm9.py b/ocpmodels/datasets/qm9.py index 3a5930569b..c0882761a1 100644 --- a/ocpmodels/datasets/qm9.py +++ b/ocpmodels/datasets/qm9.py @@ -1,11 +1,12 @@ from pathlib import Path import time - +import json import torch from torch_geometric.datasets import QM9 from ocpmodels.common.registry import registry +from ocpmodels.common.utils import ROOT @registry.register_dataset("qm9") @@ -61,6 +62,20 @@ def __init__( else: self.samples = self.perm[start:end] + self.lse_shifts = None + if self.config.get("lse_shift"): + self.lse_shifts = torch.tensor( + json.loads( + ( + ROOT + / "configs" + / "models" + / "qm9-metadata" + / "lse-shifts-pre-attr.json" + ).read_text() + ) + ) + def close_db(self): pass @@ -76,6 +91,12 @@ def __getitem__(self, idx): data.cell_offsets = torch.zeros((data.edge_index.shape[1], 3)) del data.z data.tags = torch.full((data.natoms,), -1, dtype=torch.long) + + if self.lse_shifts is not None: + data.lse_shift = self.lse_shifts[self.target][data.atomic_numbers].sum() + data.y_unshifted = data.y + data.y = data.y - data.lse_shift + t1 = time.time_ns() if self._transform is not None: data = self._transform(data) diff --git a/scripts/compute_qm9_lse.py b/scripts/compute_qm9_lse.py new file mode 100644 index 0000000000..268f1c2caa --- /dev/null +++ b/scripts/compute_qm9_lse.py @@ -0,0 +1,44 @@ +import json +from pathlib import Path +from tqdm import tqdm +import numpy as np +from sklearn.feature_extraction import DictVectorizer +from torch_geometric.datasets import QM9 + + +def count_fn(y): + return dict(zip(*np.unique(y, return_counts=True))) + + +if __name__ == "__main__": + # from SO3Krates + # https://github.com/thorben-frank/mlff/blob/v0.1/mlff/src/data/preprocessing.py#L297 + base = Path("/network/projects/ocp/qm9") + ds = QM9(base) + + shifts_per_attr = [] + + for attr in tqdm(range(ds[0].y.shape[-1])): + + data = [(d.y[0, attr].numpy(), d.z) for d in ds] + q = np.array([d[0] for d in data]) + max_n_atoms = max([len(d[1]) for d in data]) + z = np.array([np.pad(d[1], (0, max_n_atoms - len(d[1]))) for d in data]) + u = np.unique(z) + idx_ = u != 0 # remove padding with 0 + lhs_counts = list(map(count_fn, z)) + v = DictVectorizer(sparse=False) + X = v.fit_transform(lhs_counts) + X = X[..., idx_] + + sol = np.linalg.lstsq(X, q, rcond=None) + shifts = np.zeros(np.max(u) + 1) + for k, v in dict(zip(u[idx_], sol[0])).items(): + shifts[k] = v + shifts_per_attr.append(shifts.tolist()) + + j_dir = ( + Path(__file__).resolve().parent.parent / "configs" / "models" / "qm9-metadata" + ) + j_dir.mkdir(parents=True, exist_ok=True) + (j_dir / "lse-shifts-pre-attr.json").write_text(json.dumps(shifts_per_attr)) From 296ee5c32864306338c7dbda35d9144cd248db83 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:26:48 -0500 Subject: [PATCH 206/273] merge val_ood/train --- configs/models/tasks/qm7x.yaml | 21 +++++++++++---------- ocpmodels/datasets/qm7x.py | 6 +++++- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 3408bf5b8f..10919390a2 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -38,6 +38,7 @@ default: std_divider: 1.0 rescale_with_hof: False lse_shift: True + include_val_ood: True val_id: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by 
utils.py if this is True @@ -46,23 +47,23 @@ default: forces_target: totFOR std_divider: 1.0 lse_shift: True - val_ood: + # val_ood: + # src: /network/projects/ocp/qm7x/processed + # normalize_labels: True # mean and std of target will be set by utils.py if this is True + # split: val_ood + # target: ePBE0+MBD + # forces_target: totFOR + # std_divider: 1.0 + # lse_shift: True + test: src: /network/projects/ocp/qm7x/processed normalize_labels: True # mean and std of target will be set by utils.py if this is True - split: val_ood + split: test target: ePBE0+MBD forces_target: totFOR std_divider: 1.0 lse_shift: True - # TEST SET DO NOT ENABLE - # - src: /network/projects/ocp/qm9 - # target: 7 # predict internal energy at 0K at index 7 - # seed: 123 - # ratio: - # start: 0.85 - # end: 1.0 - # ^`target` is a string to select the target to predict as per # https://arxiv.org/abs/2006.15139 Table 2 diff --git a/ocpmodels/datasets/qm7x.py b/ocpmodels/datasets/qm7x.py index 97c3b85225..7ecb7e0bf3 100644 --- a/ocpmodels/datasets/qm7x.py +++ b/ocpmodels/datasets/qm7x.py @@ -754,9 +754,13 @@ def __init__( split in all_samples["splits"] ), f"split {split} not found in sample mapping" + sample_ids = all_samples["splits"][split] + if self.config.get("include_val_ood"): + sample_ids = sorted(sample_ids + all_samples["splits"]["val_ood"]) + self.keys = [ f'{all_samples["structures"][i][0]}-{all_samples["structures"][i][1]}' - for i in all_samples["splits"][split] + for i in sample_ids ] self.hofs = fetch_table("elements")["heat_of_formation"].values From 7f3041696b51a190ca6172636f7b21bf78f285f1 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:27:00 -0500 Subject: [PATCH 207/273] check lse qm9 --- ocpmodels/common/utils.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 8b3ca97fa5..10afc56ce8 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -306,6 +306,13 @@ def set_qm9_target_stats(trainer_config): continue if not dataset.get("normalize_labels", False): continue + elif dataset.get("lse_shift"): + print( + "Setting normalize_labels to False because of lse_shift for split", + f"{d}.", + ) + trainer_config["dataset"][d]["normalize_labels"] = False + continue assert "target" in dataset mean = target_means[dataset["target"]] std = target_stds[dataset["target"]] @@ -354,14 +361,13 @@ def set_qm7x_target_stats(trainer_config): continue if not dataset.get("normalize_labels", False): continue - else: - if dataset.get("lse_shift"): - print( - "Setting normalize_labels to False because of lse_shift for split", - f"{d}.", - ) - trainer_config["dataset"][d]["normalize_labels"] = False - continue + elif dataset.get("lse_shift"): + print( + "Setting normalize_labels to False because of lse_shift for split", + f"{d}.", + ) + trainer_config["dataset"][d]["normalize_labels"] = False + continue assert "target" in dataset, "target must be specified." 
mean = target_stats[dataset["target"]]["mean"] From dd0816c07ad61675bfe5c0c19b50f85a68dab84f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:27:07 -0500 Subject: [PATCH 208/273] format --- configs/models/qm7x-metadata/lse-shifts.json | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/configs/models/qm7x-metadata/lse-shifts.json b/configs/models/qm7x-metadata/lse-shifts.json index 8893e2a5cc..7a002ce3dc 100644 --- a/configs/models/qm7x-metadata/lse-shifts.json +++ b/configs/models/qm7x-metadata/lse-shifts.json @@ -1 +1,20 @@ -[0.0, -16.48365429710017, 0.0, 0.0, 0.0, 0.0, -1035.230325647512, -1488.1741712581756, -2045.3532693858685, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -10832.70108036143, -12520.741665730922] \ No newline at end of file +[ + 0.0, + -16.48365429710017, + 0.0, + 0.0, + 0.0, + 0.0, + -1035.230325647512, + -1488.1741712581756, + -2045.3532693858685, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + -10832.70108036143, + -12520.741665730922 +] \ No newline at end of file From e10fd1470e2b6fbc34adf0dbd9365ac627e202a6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:27:48 -0500 Subject: [PATCH 209/273] store forces_grad_target even with direct regress_forces --- ocpmodels/models/base_model.py | 6 +++--- ocpmodels/trainers/single_trainer.py | 13 +++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/ocpmodels/models/base_model.py b/ocpmodels/models/base_model.py index cef787d8bd..59648e902b 100644 --- a/ocpmodels/models/base_model.py +++ b/ocpmodels/models/base_model.py @@ -61,9 +61,9 @@ def forward(self, data): elif self.regress_forces in {"direct", "direct_with_gradient_target"}: # predicted forces are the model's direct forces preds["forces"] = forces - if self.regress_forces == "direct_with_gradient_target": - # store the energy gradient as the target - preds["forces_grad_target"] = grad_forces.detach() + # store the energy gradient as the target. Used for metrics + # only in "direct" mode. + preds["forces_grad_target"] = grad_forces.detach() else: raise ValueError( f"Unknown forces regression mode {self.regress_forces}" diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index bdb5a8908f..36d11a1d73 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -541,16 +541,17 @@ def compute_loss(self, preds, batch_list): ) loss["total_loss"].append(force_mult * loss["force_loss"]) if "forces_grad_target" in preds: - energy_grad_mult = self.config["optim"].get( - "energy_grad_coefficient", 10 - ) grad_target = preds["forces_grad_target"] loss["energy_grad_loss"] = self.loss_fn["force"]( preds["forces"][mask], grad_target[mask] ) - loss["total_loss"].append( - energy_grad_mult * loss["energy_grad_loss"] - ) + if self.model.regress_forces == "direct_with_energy_grad": + energy_grad_mult = self.config["optim"].get( + "energy_grad_coefficient", 10 + ) + loss["total_loss"].append( + energy_grad_mult * loss["energy_grad_loss"] + ) # Sanity check to make sure the compute graph is correct. 
for lc in loss["total_loss"]: assert hasattr(lc, "grad_fn") From 52cd6d96d911134c20a915a2ab97e81985bca773 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 16:27:52 -0500 Subject: [PATCH 210/273] new exps --- configs/exps/icml/qm9/fanet-manual-lse.yaml | 115 +++++++++++++++++++ configs/exps/qm7x/schnet-fanet-lse.yaml | 121 ++++++++++++++------ 2 files changed, 203 insertions(+), 33 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-manual-lse.yaml diff --git a/configs/exps/icml/qm9/fanet-manual-lse.yaml b/configs/exps/icml/qm9/fanet-manual-lse.yaml new file mode 100644 index 0000000000..877dc62249 --- /dev/null +++ b/configs/exps/icml/qm9/fanet-manual-lse.yaml @@ -0,0 +1,115 @@ +# scheduler reduce lr on plateau +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-lse + log_train_every: 200 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size, lr_initial + frame_averaging: 3D + fa_frames: random + dataset: + train: + lse_shift: true + val: + lse_shift: true + test: + lse_shift: true + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + batch_size: 64 + initial_lr: 0.0005 + max_epochs: 1500 + loss_energy: mse + loss_force: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 500 + # all below is for the ReduceLROnPlateau scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + model: + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: True + hidden_channels: 160 + num_filters: 160 + max_num_neighbors: 30 + mp_type: updownscale_bae + num_gaussians: 50 + num_interactions: 4 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: true + phys_hidden_channels: 0 + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + use_pbc: false + regress_forces: "" + + +runs: + - {} + - model: + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: false + hidden_channels: 96 + max_num_neighbors: 30 + mp_type: updownscale_bae + num_filters: 224 + num_gaussians: 128 + num_interactions: 4 + phys_embeds: false + second_layer_MLP: false + skip_co: false + - model: + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: true + hidden_channels: 110 + max_num_neighbors: 40 + mp_type: updownscale_bae + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + phys_embeds: true + second_layer_MLP: true + skip_co: true + - model: + cutoff: 6.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: true + hidden_channels: 110 + max_num_neighbors: 30 + mp_type: updownscale + num_filters: 192 + num_gaussians: 128 + num_interactions: 5 + phys_embeds: true + second_layer_MLP: false + skip_co: true diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml index 8b6000ea9b..423440f996 100644 --- a/configs/exps/qm7x/schnet-fanet-lse.yaml +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -4,7 +4,7 @@ job: cpus: 4 gres: gpu:16gb:1 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-1 + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 env: ocp-a100 default: @@ -22,17 +22,17 @@ default: cp_data_to_tmpdir: 
true note: task: name - model: name, num_gaussians, hidden_channels, num_filters, num_interactions - optim: batch_size, lr_initial + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient optim: batch_size: 10 - max_epochs: 100 + max_steps: 2000000 warmup_steps: 3000 lr_initial: 0.0001 eval_every: 0.251 - energy_coefficient: 0.01 + energy_coefficient: 1 energy_grad_coefficient: 0 - force_coefficient: 0.99 + force_coefficient: 100 # parameters EMA ema_decay: 0.999 loss_energy: mae @@ -63,26 +63,45 @@ default: runs: - config: schnet-qm7x-all - + - config: schnet-qm7x-all + model: + regress_forces: "" + - config: schnet-qm7x-all + optim: + energy_coefficient: 0 - config: schnet-qm7x-all optim: batch_size: 100 + max_steps: 1000000 + lr_initial: 0.0003 - config: fanet-qm7x-all + optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 model: + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 graph_norm: true edge_embed_type: all_rij mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct_with_gradient_target - config: fanet-qm7x-all optim: - batch_size: 100 - energy_coefficient: 0.01 - energy_grad_coefficient: 0.1 - force_coefficient: 0.89 + batch_size: 50 + max_steps: 1000000 + initial_lr: 0.0005 + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 model: - hidden_channels: 256 - num_filters: 256 + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 graph_norm: true edge_embed_type: all_rij mp_type: updownscale_base @@ -92,44 +111,80 @@ runs: - config: fanet-qm7x-all optim: batch_size: 100 - energy_coefficient: 0.01 - energy_grad_coefficient: 0 - force_coefficient: 0.99 + initial_lr: 0.001 + max_steps: 1000000 + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 model: - graph_norm: false - force_decoder_type: mlp + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 + graph_norm: true edge_embed_type: all_rij - regress_forces: direct mp_type: updownscale_base - num_interactions: 4 - regress_forces: direct + force_decoder_type: mlp + regress_forces: direct_with_gradient_target - config: fanet-qm7x-all optim: - batch_size: 100 - energy_coefficient: 0.01 + batch_size: 50 + initial_lr: 0.0005 + max_steps: 1000000 + energy_coefficient: 1 energy_grad_coefficient: 0 - force_coefficient: 0.99 + force_coefficient: 100 model: - hidden_channels: 256 - num_filters: 256 + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 graph_norm: true edge_embed_type: all_rij mp_type: updownscale_base force_decoder_type: mlp - regress_forces: from_energy + regress_forces: direct - config: fanet-qm7x-all optim: - batch_size: 100 + batch_size: 50 + initial_lr: 0.0005 + max_steps: 1000000 energy_coefficient: 1 energy_grad_coefficient: 0 - force_coefficient: 0 + force_coefficient: 100 model: - graph_norm: false - force_decoder_type: mlp + num_interactions: 4 + hidden_channels: 160 + num_filters: 160 + graph_norm: true edge_embed_type: all_rij + mp_type: simple + complex_mp: true + second_layer_mlp: true + force_decoder_type: mlp regress_forces: direct - mp_type: updownscale_base + + - config: fanet-qm7x-all + optim: + batch_size: 50 + initial_lr: 0.0005 + max_steps: 1000000 + energy_coefficient: 0 + energy_grad_coefficient: 0 + force_coefficient: 1 + model: num_interactions: 4 - regress_forces: "" + hidden_channels: 160 + num_filters: 160 + 
graph_norm: true + edge_embed_type: all_rij + mp_type: updownscale_base + force_decoder_type: mlp + regress_forces: direct + + + + + + + From 3345a29f987aa8f603bdf03cbd632e90324a7eea Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 17:34:50 -0500 Subject: [PATCH 211/273] fix energy grad logging --- ocpmodels/models/base_model.py | 18 ++++++++---------- ocpmodels/trainers/base_trainer.py | 4 ++-- ocpmodels/trainers/single_trainer.py | 5 +++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ocpmodels/models/base_model.py b/ocpmodels/models/base_model.py index 59648e902b..3a0ac3a93f 100644 --- a/ocpmodels/models/base_model.py +++ b/ocpmodels/models/base_model.py @@ -34,9 +34,8 @@ def forces_forward(self, preds): def forward(self, data): grad_forces = forces = None - if self.regress_forces in {"from_energy", "direct_with_gradient_target"}: - # energy gradient w.r.t. positions will be computed - data.pos.requires_grad_(True) + # energy gradient w.r.t. positions will be computed + data.pos.requires_grad_(True) # predict energy preds = self.energy_forward(data) @@ -47,13 +46,12 @@ def forward(self, data): # predict forces forces = self.forces_forward(preds) - if self.regress_forces in {"from_energy", "direct_with_gradient_target"}: - if "gemnet" in self.__class__.__name__.lower(): - # gemnet forces are already computed - grad_forces = forces - else: - # compute forces from energy gradient - grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) + if "gemnet" in self.__class__.__name__.lower(): + # gemnet forces are already computed + grad_forces = forces + else: + # compute forces from energy gradient + grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) if self.regress_forces == "from_energy": # predicted forces are the energy gradient diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 901908345a..745fcf06d9 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -610,7 +610,7 @@ def validate( ): if dist_utils.is_master() and not self.silent: print() - logging.info(f"🧐 Evaluating on {split}.") + logging.info(f"\n >>> 🧐 Evaluating on {split}.") if self.is_hpo: disable_tqdm = True @@ -679,7 +679,7 @@ def validate( if dist_utils.is_master() and not self.silent: log_str = ["{}: {:.4f}".format(k, v) for k, v in log_dict.items()] - print("\n > ".join([""] + log_str)) + print(("\n > ".join([""] + log_str))[1:]) print() # Make plots. 
diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 36d11a1d73..c8a35a7e90 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -545,7 +545,7 @@ def compute_loss(self, preds, batch_list): loss["energy_grad_loss"] = self.loss_fn["force"]( preds["forces"][mask], grad_target[mask] ) - if self.model.regress_forces == "direct_with_energy_grad": + if self.model.module.regress_forces == "direct_with_energy_grad": energy_grad_mult = self.config["optim"].get( "energy_grad_coefficient", 10 ) @@ -655,7 +655,8 @@ def log_train_metrics(self, end_of_epoch=False): if not self.silent: log_str = ["{}: {:.2e}".format(k, v) for k, v in log_dict.items()] print( - f"Train metrics at step {self.step}:\n > " + "\n > ".join(log_str) + f"\nTrain metrics at step {self.step}:\n > " + + "\n > ".join(log_str) ) self.metrics = {} From a0cd09d065636765f66b1cba7d7eda9ac38eff43 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 19:49:19 -0500 Subject: [PATCH 212/273] typo --- scripts/compute_is2re_lse.py | 47 ++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 scripts/compute_is2re_lse.py diff --git a/scripts/compute_is2re_lse.py b/scripts/compute_is2re_lse.py new file mode 100644 index 0000000000..924a8d6c53 --- /dev/null +++ b/scripts/compute_is2re_lse.py @@ -0,0 +1,47 @@ +import json +from pathlib import Path +import h5py +from tqdm import tqdm +import numpy as np +from sklearn.feature_extraction import DictVectorizer +import sys + +sys.path.append(str(Path(__file__).resolve().parent.parent)) + +from ocpmodels.datasets.lmdb_dataset import LmdbDataset + + +def count_fn(y): + return dict(zip(*np.unique(y, return_counts=True))) + + +if __name__ == "__main__": + # from SO3Krates + # https://github.com/thorben-frank/mlff/blob/v0.1/mlff/src/data/preprocessing.py#L297 + ds = LmdbDataset({"src": "/network/projects/ocp/oc20/is2re/all/train/"}) + data = [(d["y"], d["atomic_numbers"]) for d in tqdm(ds, total=len(ds))] + + q = np.array([d[0].item() for d in data]) + max_n_atoms = max([len(d[1]) for d in data]) + z = np.array([np.pad(d[1], (0, max_n_atoms - len(d[1]))) for d in data]) + u = np.unique(z) + idx_ = u != 0 # remove padding with 0 + lhs_counts = list(map(count_fn, z)) + v = DictVectorizer(sparse=False) + X = v.fit_transform(lhs_counts) + X = X[..., idx_] + + sol = np.linalg.lstsq(X, q, rcond=None) + shifts = np.zeros(np.max(u) + 1) + for k, v in dict(zip(u[idx_], sol[0])).items(): + shifts[k] = v + + ( + Path("/home/mila/s/schmidtv/ocp-project/ocp-drlab") + / "configs" + / "models" + / "is2re-metadata" + / "lse-shifts.json" + ).write_text(json.dumps(shifts.tolist())) + + q_shifts = shifts[z].sum(-1) From d7e720688390c54d1c074eefe2394bfaa8f658ec Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Fri, 20 Jan 2023 19:49:48 -0500 Subject: [PATCH 213/273] typo --- ocpmodels/common/exp_manager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index 6a2bead459..fb2005a7b3 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -13,7 +13,8 @@ sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from ocpmodels.common.utils import ROOT, RUN_DIR, get_and_move_orion_db_path +from ocpmodels.common.utils import ROOT, RUN_DIR +from ocpmodels.common.orion_utils import get_and_move_orion_db_path EXP_OUT_DIR = ROOT / "data" / "exp_outputs" MANAGER_CACHE = ROOT 
/ "data" / "exp_manager_cache" From c66c4e1b640d621907578ff189fd7173cce305d4 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 13:15:41 -0500 Subject: [PATCH 214/273] sbatch resume --- configs/sbatch/defaults.yaml | 3 ++ main.py | 2 +- ocpmodels/common/exp_manager.py | 2 +- ocpmodels/common/utils.py | 28 +++++++++++----- sbatch.py | 59 +++++++++++++++++++++++++++++++-- 5 files changed, 81 insertions(+), 13 deletions(-) diff --git a/configs/sbatch/defaults.yaml b/configs/sbatch/defaults.yaml index 97afac49df..d00797d755 100644 --- a/configs/sbatch/defaults.yaml +++ b/configs/sbatch/defaults.yaml @@ -28,6 +28,9 @@ code_loc: null # code location. Defaults to the current repository path output: "$SCRATCH/ocp/runs/%j/output-%t.txt" # slurm output file per task (%t) logdir: "$SCRATCH/ocp/runs/$SLURM_JOB_ID" # --logdir value for main.py, appended to py_args if not already present +continue_from_dir: null +restart_from_dir: null + env: "ocp" # env name for `conda activate {env}` py_args: "" # arguments for main.py note: "" # wandb run note diff --git a/main.py b/main.py index 01fc6c3c6d..cb81c7fb61 100644 --- a/main.py +++ b/main.py @@ -109,7 +109,7 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): # -- Initial setup setup_imports() - print("All things imported.\n") + print("\n🚩 All things imported.\n") start_time = time.time() try: diff --git a/ocpmodels/common/exp_manager.py b/ocpmodels/common/exp_manager.py index fb2005a7b3..0106863cd0 100644 --- a/ocpmodels/common/exp_manager.py +++ b/ocpmodels/common/exp_manager.py @@ -235,7 +235,7 @@ def parse_output_files(self): elif "srun: Job step aborted" in out_txt: if "slurmstepd" in out_txt and " CANCELLED AT " in out_txt: self.cache["job_state"][j] = "Cancelled" - elif "nan_loss" in out_txt: + elif "Loss is NaN. Stopping training." 
in out_txt: self.cache["job_state"][j] = "NaN loss" else: self.cache["job_state"][j] = "Unknown" diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 10afc56ce8..57e91e2944 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -164,7 +164,7 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): ): return trainer_config - print("\nMoving data to slurm tmpdir", flush=True) + print("\n🚉 Copying data to slurm tmpdir", flush=True) tmp_dir = os.environ.get("SLURM_TMPDIR") or f"/Tmp/slurm.{JOB_ID}.0" tmp_dir = Path(tmp_dir) @@ -177,21 +177,21 @@ def move_lmdb_data_to_slurm_tmpdir(trainer_config): new_dir = tmp_dir / original.name if new_dir.exists(): print( - f"Data already copied to {str(new_dir)} for split", + f" Data already copied to {str(new_dir)} for split", f"{s} with source path {split['src']}", flush=True, ) trainer_config["dataset"][s]["src"] = str(new_dir) continue - print("Making new_dir: ", str(new_dir), flush=True) + print(" Making new_dir: ", str(new_dir), flush=True) new_dir.mkdir() command = ["cp", "-r", f"{str(original)}", str(new_dir.parent)] - print("Copying data: ", " ".join(command), flush=True) + print(" Copying data: ", " ".join(command), flush=True) subprocess.run(command) for f in new_dir.glob("*.lmdb-lock"): f.unlink() trainer_config["dataset"][s]["src"] = str(new_dir) - print("Done moving data to", str(new_dir), flush=True) + print(" Done moving data to", str(new_dir), flush=True) return trainer_config @@ -877,7 +877,7 @@ def set_cpus_to_workers(config): workers = cpus // gpus if not config["silent"]: print( - f"Overriding num_workers from {config['optim']['num_workers']}", + f"🏭 Overriding num_workers from {config['optim']['num_workers']}", f"to {workers} to match the machine's CPUs.", "Use --no_cpus_to_workers=true to disable this behavior.", ) @@ -1002,7 +1002,7 @@ def build_config(args, args_override): dels[k] = copy.deepcopy(continue_config[k]) continue_config[k] = None print( - "Removing orion config from continue config. Set to None:", + "🅾️ Removing orion config from continue config. 
Set to None:", "{" + ", ".join([f"{k}: {v}->None" for k, v in dels.items()]) + "}", ) print( @@ -1032,11 +1032,23 @@ def build_config(args, args_override): config["world_size"] = args.num_nodes * args.num_gpus if continue_config: - new_dirs = [(k, v) for k, v in config.items() if "dir" in k] + new_dirs = [ + (k, v) for k, v in config.items() if "dir" in k and k != "cp_data_to_tmpdir" + ] + data_srcs = copy.deepcopy( + { + k: { + "src": v["src"] + } # keep original src, if data was moved in the resumed exp + for k, v in config["dataset"].items() + if isinstance(v, dict) and "src" in v + } + ) config = merge_dicts( continue_config, {k: resolve(v) if isinstance(v, str) else v for k, v in new_dirs}, ) + config["dataset"] = merge_dicts(config["dataset"], data_srcs) cli = cli_args_dict() if "max_steps" in cli.get("optim", {}): if "max_epochs" in cli.get("optim", {}): diff --git a/sbatch.py b/sbatch.py index bf17bd5c6f..b7a4d174d7 100644 --- a/sbatch.py +++ b/sbatch.py @@ -1,4 +1,4 @@ -from minydra import resolved_args +from minydra import resolved_args, MinyDict from pathlib import Path from datetime import datetime import os @@ -77,7 +77,9 @@ def discover_minydra_defaults(): user_config = root / "configs" / "sbatch" / f"{os.environ['USER']}.yaml" if user_config.exists() and user_config.is_file(): defaults.append(user_config) - return defaults + return MinyDict( + {k: v for d in defaults for k, v in yaml.safe_load(d.read_text()).items()} + ) def resolve(path): @@ -214,17 +216,68 @@ def write_orion_config(args, outdir): (outdir / f"{unique_exp_name}.exp").touch() +def load_sbatch_args_from_dir(dir): + dir = resolve(dir) + sbatch_files = list(dir.glob("sbatch_*.sh")) + if not sbatch_files: + raise FileNotFoundError(f"No sbatch file found in {str(dir)}") + sbatch_file = sbatch_files[0] + sbatch_lines = [ + line.split("#SBATCH")[1].strip() + for line in sbatch_file.read_text().splitlines() + if "#SBATCH " in line + ] + sbatch_args = {} + for line in sbatch_lines: + k, v = ( + line[2:] + if line.startswith("--") + else line[1:] + if line.startswith("-") + else line + ).split("=") + sbatch_args[k] = v + args = { + "job_name": sbatch_args["job-name"], + "nodes": int(sbatch_args["nodes"]), + "ntasks_per_node": int(sbatch_args["ntasks-per-node"]), + "partition": sbatch_args["partition"], + "cpus": int(sbatch_args["cpus-per-task"]), + "mem": sbatch_args["mem"], + "gres": sbatch_args["gres"], + "output": sbatch_args["output"], + } + return args + + if __name__ == "__main__": # has the submission been successful? success = False wandb_offline = "" sbatch_py_vars = {} + minydra_defaults = discover_minydra_defaults() # repository root root = Path(__file__).resolve().parent # parse and resolve args. # defaults are loaded and overwritten from the command-line as `arg=value` - args = resolved_args(defaults=discover_minydra_defaults()) + args = resolved_args(defaults=minydra_defaults) + + if args.restart_from_dir or args.continue_from_dir: + if args.restart_from_dir and args.continue_from_dir: + raise ValueError( + "Cannot restart and continue from the same directory. 
" + "Please specify only one of restart_from_dir= or continue_from_dir=" + ) + resume_dir = args.restart_from_dir or args.continue_from_dir + mode = "restart" if args.restart_from_dir else "continue" + sba = load_sbatch_args_from_dir(resume_dir) + cli_sba = {k: v for k, v in args.items() if v != minydra_defaults[k]} + args = MinyDict({**args, **sba, **cli_sba}) + if not args.py_args: + args.py_args = "" + args.py_args += f" --{mode}_from_dir={str(resume_dir)}" + modules = ( [] if not args.modules From e6a754dd697dc7d5fbdd482691a534e465aa2cd6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 13:58:40 -0500 Subject: [PATCH 215/273] enable gradient to log force energy grad --- ocpmodels/trainers/base_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 745fcf06d9..2c98c45a27 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -599,7 +599,6 @@ def train(self): """Derived classes should implement this function.""" pass - @torch.no_grad() def validate( self, split="val", @@ -608,6 +607,7 @@ def validate( is_final=False, is_first=False, ): + torch.set_grad_enabled(bool(self.config["model"].get("regress_forces", ""))) if dist_utils.is_master() and not self.silent: print() logging.info(f"\n >>> 🧐 Evaluating on {split}.") From 13bc4b5c9322c3d704f08797cb7ce53340184fc0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 17:47:10 -0500 Subject: [PATCH 216/273] update --- .../exps/icml/qm9/fanet-best-all-targets.yaml | 207 ++++++++++++++++++ configs/exps/icml/qm9/fanet-manual-lse.yaml | 6 +- configs/exps/qm7x/schnet-fanet-lse.yaml | 8 +- configs/models/is2re-metadata/lse-shifts.json | 1 + launch_exp.py | 16 +- ocpmodels/trainers/single_trainer.py | 12 +- 6 files changed, 231 insertions(+), 19 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-best-all-targets.yaml create mode 100644 configs/models/is2re-metadata/lse-shifts.json diff --git a/configs/exps/icml/qm9/fanet-best-all-targets.yaml b/configs/exps/icml/qm9/fanet-best-all-targets.yaml new file mode 100644 index 0000000000..94a35433fb --- /dev/null +++ b/configs/exps/icml/qm9/fanet-best-all-targets.yaml @@ -0,0 +1,207 @@ +# scheduler reduce lr on plateau +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + # dev: true + # verbose: true + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-lse + log_train_every: 200 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size, lr_initial + frame_averaging: 3D + fa_frames: random + dataset: + train: + lse_shift: true + val: + lse_shift: true + test: + lse_shift: true + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + batch_size: 64 + initial_lr: 0.0005 + max_epochs: 1500 + loss_energy: mse + loss_force: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 500 + # all below is for the ReduceLROnPlateau scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + model: + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: True + hidden_channels: 110 + max_num_neighbors: 40 + mp_type: updownscale_base + 
num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: true + phys_hidden_channels: 0 + regress_forces: "" + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + use_pbc: false + + +runs: + - dataset: + train: + target: 0 + val: + target: 0 + test: + target: 0 + - dataset: + train: + target: 1 + val: + target: 1 + test: + target: 1 + - dataset: + train: + target: 2 + val: + target: 2 + test: + target: 2 + - dataset: + train: + target: 3 + val: + target: 3 + test: + target: 3 + - dataset: + train: + target: 4 + val: + target: 4 + test: + target: 4 + - dataset: + train: + target: 5 + val: + target: 5 + test: + target: 5 + - dataset: + train: + target: 6 + val: + target: 6 + test: + target: 6 + - dataset: + train: + target: 7 + val: + target: 7 + test: + target: 7 + - dataset: + train: + target: 8 + val: + target: 8 + test: + target: 8 + - dataset: + train: + target: 9 + val: + target: 9 + test: + target: 9 + - dataset: + train: + target: 10 + val: + target: 10 + test: + target: 10 + - dataset: + train: + target: 11 + val: + target: 11 + test: + target: 11 + - dataset: + train: + target: 12 + val: + target: 12 + test: + target: 12 + - dataset: + train: + target: 13 + val: + target: 13 + test: + target: 13 + - dataset: + train: + target: 14 + val: + target: 14 + test: + target: 14 + - dataset: + train: + target: 15 + val: + target: 15 + test: + target: 15 + - dataset: + train: + target: 16 + val: + target: 16 + test: + target: 16 + - dataset: + train: + target: 17 + val: + target: 17 + test: + target: 17 + - dataset: + train: + target: 18 + val: + target: 18 + test: + target: 18 diff --git a/configs/exps/icml/qm9/fanet-manual-lse.yaml b/configs/exps/icml/qm9/fanet-manual-lse.yaml index 877dc62249..d3ddd8f39b 100644 --- a/configs/exps/icml/qm9/fanet-manual-lse.yaml +++ b/configs/exps/icml/qm9/fanet-manual-lse.yaml @@ -55,7 +55,7 @@ default: hidden_channels: 160 num_filters: 160 max_num_neighbors: 30 - mp_type: updownscale_bae + mp_type: updownscale_base num_gaussians: 50 num_interactions: 4 otf_graph: false @@ -78,7 +78,7 @@ runs: graph_norm: false hidden_channels: 96 max_num_neighbors: 30 - mp_type: updownscale_bae + mp_type: updownscale_base num_filters: 224 num_gaussians: 128 num_interactions: 4 @@ -92,7 +92,7 @@ runs: graph_norm: true hidden_channels: 110 max_num_neighbors: 40 - mp_type: updownscale_bae + mp_type: updownscale_base num_filters: 384 num_gaussians: 64 num_interactions: 4 diff --git a/configs/exps/qm7x/schnet-fanet-lse.yaml b/configs/exps/qm7x/schnet-fanet-lse.yaml index 423440f996..26fa07d862 100644 --- a/configs/exps/qm7x/schnet-fanet-lse.yaml +++ b/configs/exps/qm7x/schnet-fanet-lse.yaml @@ -58,7 +58,7 @@ default: lse_shift: True val_id: lse_shift: True - val_ood: + test: lse_shift: True runs: @@ -182,9 +182,3 @@ runs: force_decoder_type: mlp regress_forces: direct - - - - - - diff --git a/configs/models/is2re-metadata/lse-shifts.json b/configs/models/is2re-metadata/lse-shifts.json new file mode 100644 index 0000000000..153cfc0851 --- /dev/null +++ b/configs/models/is2re-metadata/lse-shifts.json @@ -0,0 +1 @@ +[0.0, -0.03168837170037106, 0.0, 0.0, 0.0, -0.018916724897170528, -0.07516872833312764, 0.0016345619191071165, 0.013287692526233786, 0.0, 0.0, -0.010099893232592166, 0.0, -0.013262464762207531, -0.0040779598604658, -0.002059577810890844, 0.0022706829024690778, 0.0028789674984750213, 0.0, -0.015036774393646663, -0.043021322980277346, -0.06111072563346156, -0.04210049361780378, 
-0.03422540877588413, -0.022856732968670863, -0.03103305334142486, -0.026134560547974663, -0.019305524888569604, -0.013493352269468968, -0.008163195381331444, -0.010904761470386284, -0.00405098156937312, 0.001852697634236705, 0.0020690026083794895, 0.0035793557500934155, 0.0, 0.0, -0.013383204575329636, -0.03858810522009537, -0.0576401252068183, -0.059088351366679136, -0.036947509002007, -0.027747679316385585, -0.02807965978344211, -0.01876626301744238, -0.011546582100635678, -0.0055021423373771625, -0.0001371757442072078, -0.006848676469819966, -0.0014570150904904752, -0.0001179312223327613, 0.0002476578996626392, 0.005820879587264792, 0.0, 0.0, -0.016141335226783824, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.052932107762963754, -0.039172881142421144, -0.025784458949892612, -0.028377167031415778, -0.02227405667797154, -0.013746130132304628, -0.007585348194747942, -0.002274605617351183, -0.0002101065095853269, 0.001659669572096274, 0.0035125185202919567, 0.0028364718870182238] \ No newline at end of file diff --git a/launch_exp.py b/launch_exp.py index 7fa87240b8..87046a1e6a 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -257,6 +257,8 @@ def get_args_or_exp(key, args, exp): for c, command in enumerate(commands): print(f"Launching job {c+1:3}", end="\r") outputs.append(os.popen(command).read().strip()) + if " verbose=true" in command.lower(): + print(outputs[-1]) except KeyboardInterrupt: is_interrupted = True outdir = ROOT / "data" / "exp_outputs" / exp_name @@ -276,12 +278,12 @@ def get_args_or_exp(key, args, exp): with outfile.open("w") as f: f.write(text) print("\n\n ✅ Done!") - print(util_strings(jobs)) - # print(f" • Output written to {str(outfile)}") - yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) - print( - " • Experiment summary YAML in ", - f"./{str(yml_out.relative_to(Path.cwd()))}", - ) + if jobs: + print(util_strings(jobs)) + yml_out = write_exp_yaml_and_jobs(exp_file, outfile, jobs) + print( + " • Experiment summary YAML in ", + f"./{str(yml_out.relative_to(Path.cwd()))}", + ) else: print("Aborting") diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index c8a35a7e90..416af29799 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -271,8 +271,16 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): print("\n\n >>> 🛑 Loss is NaN. Stopping training.\n\n") self.logger.add_tags(["nan_loss"]) return "loss_is_nan" - self._backward(loss) - + try: + self._backward(loss) + except RuntimeError: + print("\nBackward loss issue") + print(loss) + print( + "Requires grad:", + {k: v.requires_grad for k, v in self.loss.items()}, + ) + print() # Compute metrics. 
self.metrics = self.compute_metrics( preds, From 2a482da2d85fe6e6860f51bc91d6b316d1af5f8f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 18:21:15 -0500 Subject: [PATCH 217/273] update orion search --- configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 73 +++++++++++++++++++ configs/models/tasks/qm9.yaml | 6 +- 2 files changed, 76 insertions(+), 3 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml new file mode 100644 index 0000000000..7b9a6bd8ea --- /dev/null +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -0,0 +1,73 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 8GB + cpus: 4 + gres: gpu:1 + time: 02:50:00 + partition: long + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion + log_train_every: 200 + optim: + batch_size: 64 + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + loss_energy: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 600 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: num_filters, pg_hidden_channels, num_gaussians + frame_averaging: 3D + fa_frames: random + model: + mp_type: updownscale_base + edge_embed_type: all_rij + energy_head: "" + num_gaussians: 100 + pg_hidden_channels: 32 + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 150 + + unique_exp_name: fanet-qm9-v6.0.0 + + space: + optim/max_epochs: fidelity(600, 1000, base=12) + optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) + model/cutoff: uniform(5, 6, precision=1) + model/graph_norm: choices([True, False]) + model/hidden_channels: choices([100, 200, 300, 400, 500, 1000]) + model/max_num_neighbours: choices([30, 40, 50]) + model/num_filters: uniform(7, 16, discrete=True) + model/num_interactions: uniform(3, 5, discrete=True) + model/phys_embeds: choices([True, False]) + model/second_layer_mlp: choices([True, False]) + model/skip_co: choices([True, False]) + algorithms: + asha: + seed: 123 + num_rungs: 3 + num_brackets: 2 diff --git a/configs/models/tasks/qm9.yaml b/configs/models/tasks/qm9.yaml index 42b4256a1e..ecdc1d1dac 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -30,7 +30,7 @@ default: target: 12 # predict atomization energy at 0K at index 12 seed: 123 normalize_labels: True # mean and std of target will be set by qm9.py if this is True - lse_shift: false + lse_shift: true indices: start: 0 end: 110000 @@ -38,7 +38,7 @@ default: src: /network/projects/ocp/qm9 target: 12 # predict atomization energy at 0K at index 12 seed: 123 - lse_shift: false + lse_shift: true indices: start: 110000 end: 120000 @@ -46,7 +46,7 @@ default: src: /network/projects/ocp/qm9 target: 12 # predict atomization energy at 0K at index 12 seed: 123 - lse_shift: false + lse_shift: true indices: start: 120000 end: -1 From a59b58271bb9c2c4e42c6c0a33d39363b0750e5b Mon Sep 17 00:00:00 2001 
From: Victor Schmidt Date: Sat, 21 Jan 2023 18:22:01 -0500 Subject: [PATCH 218/273] slightly larger time --- configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml index 7b9a6bd8ea..d555ad0351 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -3,7 +3,7 @@ job: mem: 8GB cpus: 4 gres: gpu:1 - time: 02:50:00 + time: 02:55:00 partition: long default: From 64f63a3304771893a43eb1031cd216316460f2f3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 18:56:31 -0500 Subject: [PATCH 219/273] typo --- configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml index d555ad0351..b8d1f1c396 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -52,7 +52,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 150 - unique_exp_name: fanet-qm9-v6.0.0 + unique_exp_name: fanet-qm9-v6.0.1 space: optim/max_epochs: fidelity(600, 1000, base=12) @@ -60,7 +60,7 @@ orion: model/cutoff: uniform(5, 6, precision=1) model/graph_norm: choices([True, False]) model/hidden_channels: choices([100, 200, 300, 400, 500, 1000]) - model/max_num_neighbours: choices([30, 40, 50]) + model/max_num_neighbors: choices([30, 40, 50]) model/num_filters: uniform(7, 16, discrete=True) model/num_interactions: uniform(3, 5, discrete=True) model/phys_embeds: choices([True, False]) From ecd8b5eb10dd3d4acc9453702b8dd1dd850d5d8c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 19:12:32 -0500 Subject: [PATCH 220/273] early-stopping file --- ocpmodels/trainers/base_trainer.py | 8 +++++++- ocpmodels/trainers/single_trainer.py | 17 ++++++++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 2c98c45a27..e816c5239e 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -25,7 +25,7 @@ from torch.utils.data import DataLoader from torch_geometric.data import Batch from tqdm import tqdm - +from uuid import uuid4 from ocpmodels.common import dist_utils from ocpmodels.common.data_parallel import ( BalancedBatchSampler, @@ -54,6 +54,7 @@ def __init__(self, **kwargs): model_name = kwargs["model"].pop( "name", kwargs.get("model_name", "Unknown - base_trainer issue") ) + self.early_stopping_file = resolve(run_dir) / f"{str(uuid4())}.stop" kwargs["model"]["graph_rewiring"] = kwargs.get("graph_rewiring") self.config = { @@ -64,6 +65,7 @@ def __init__(self, **kwargs): "checkpoint_dir": str(resolve(run_dir) / "checkpoints"), "results_dir": str(resolve(run_dir) / "results"), "logs_dir": str(resolve(run_dir) / "logs"), + "early_stopping_file": str(self.early_stopping_file), } self.sigterm = False @@ -147,6 +149,10 @@ def __init__(self, **kwargs): if dist_utils.is_master() and not self.silent: print(f"\n🧰 Trainer config:\n{'-'*18}\n") print(yaml.dump(self.config), end="\n\n") + print( + f"\n\n🚦 Create {str(self.early_stopping_file)}", + "to stop the training after the next validation\n", + ) self.load() self.evaluator = Evaluator( diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 
416af29799..3657c8b686 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -346,10 +346,21 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): checkpoint_file="best_checkpoint.pt", training_state=False, ) - if self.early_stopper.should_stop( - current_val_metric, self.scheduler.get_lr(), self.epoch + if ( + self.early_stopper.should_stop( + current_val_metric, self.scheduler.get_lr(), self.epoch + ) + or self.early_stopping_file.exists() ): - print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") + if self.early_stopping_file.exists(): + print("\n\n >>> 🛑 Early stopping file found.\n\n") + self.early_stopping_file.rename( + self.early_stopping_file.parent + / f"{self.early_stopping_file.stem}_{self.now}.txt" + ) + else: + print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") + if self.logger: self.logger.add_tags(["E-S"]) return self.end_of_training( From 1e9c96620c53e9c9cbe4554d8e0580f65fa28dee Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 19:17:20 -0500 Subject: [PATCH 221/273] fix post val grad --- ocpmodels/trainers/base_trainer.py | 1 + ocpmodels/trainers/single_trainer.py | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index e816c5239e..2bc1a575c3 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -705,6 +705,7 @@ def validate( if self.ema: self.ema.restore() + torch.set_grad_enabled(True) return metrics @abstractmethod diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 3657c8b686..d73a5f2743 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -271,6 +271,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): print("\n\n >>> 🛑 Loss is NaN. Stopping training.\n\n") self.logger.add_tags(["nan_loss"]) return "loss_is_nan" + try: self._backward(loss) except RuntimeError: @@ -278,7 +279,7 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): print(loss) print( "Requires grad:", - {k: v.requires_grad for k, v in self.loss.items()}, + {k: v.requires_grad for k, v in loss.items()}, ) print() # Compute metrics. 
From be17a53bf9e857410a99a95388ef5545350b5fc7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 19:29:41 -0500 Subject: [PATCH 222/273] typo `second_layer_MLP` --- configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml index b8d1f1c396..b0af9eec5b 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -52,7 +52,7 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 150 - unique_exp_name: fanet-qm9-v6.0.1 + unique_exp_name: fanet-qm9-v6.0.2 space: optim/max_epochs: fidelity(600, 1000, base=12) @@ -64,7 +64,7 @@ orion: model/num_filters: uniform(7, 16, discrete=True) model/num_interactions: uniform(3, 5, discrete=True) model/phys_embeds: choices([True, False]) - model/second_layer_mlp: choices([True, False]) + model/second_layer_MLP: choices([True, False]) model/skip_co: choices([True, False]) algorithms: asha: From a932725e76d3ea4326f84e0dd29825bbb2fb850f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 19:39:15 -0500 Subject: [PATCH 223/273] new qm7x exp --- configs/exps/qm7x/fanet-lse-v1.yaml | 150 ++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 configs/exps/qm7x/fanet-lse-v1.yaml diff --git a/configs/exps/qm7x/fanet-lse-v1.yaml b/configs/exps/qm7x/fanet-lse-v1.yaml new file mode 100644 index 0000000000..5dd3da7def --- /dev/null +++ b/configs/exps/qm7x/fanet-lse-v1.yaml @@ -0,0 +1,150 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + env: ocp-a100 + +default: + config: fanet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + phys_hidden_channels: 0 + phys_embeds: False + energy_head: False + pg_hidden_channels: 0 + tag_hidden_channels: 0 + frame_averaging: "" + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 0.201 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + att_heads: 1 + complex_mp: false + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: false + force_decoder_type: mlp + graph_norm: false + hidden_channels: 128 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 128 + num_gaussians: 20 + num_interactions: 4 + pg_hidden_channels: 0 + phys_embeds: false + second_layer_MLP: false + skip_co: false + tag_hidden_channels: 0 + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + test: + lse_shift: True + +runs: + - {} + - model: + regress_forces: direct_with_gradient_target + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: 
direct_with_gradient_target + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mlp: true + + - model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mlp: true + + - model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mlp: true + + - model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mlp: true + From a116ed2f92965e943fb513c39d2f11842a9eab9f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sat, 21 Jan 2023 20:17:19 -0500 Subject: [PATCH 224/273] update exps --- ...ts.yaml => fanet-best-v5-all-targets.yaml} | 0 .../icml/qm9/fanet-manual-lse-best-v5.yaml | 217 ++++++++++++++++++ configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml | 1 + configs/exps/qm7x/fanet-lse-v1.yaml | 23 +- 4 files changed, 237 insertions(+), 4 deletions(-) rename configs/exps/icml/qm9/{fanet-best-all-targets.yaml => fanet-best-v5-all-targets.yaml} (100%) create mode 100644 configs/exps/icml/qm9/fanet-manual-lse-best-v5.yaml diff --git a/configs/exps/icml/qm9/fanet-best-all-targets.yaml b/configs/exps/icml/qm9/fanet-best-v5-all-targets.yaml similarity index 100% rename from configs/exps/icml/qm9/fanet-best-all-targets.yaml rename to configs/exps/icml/qm9/fanet-best-v5-all-targets.yaml diff --git a/configs/exps/icml/qm9/fanet-manual-lse-best-v5.yaml b/configs/exps/icml/qm9/fanet-manual-lse-best-v5.yaml new file mode 100644 index 0000000000..8632b1132b --- /dev/null +++ b/configs/exps/icml/qm9/fanet-manual-lse-best-v5.yaml @@ -0,0 +1,217 @@ +# scheduler reduce lr on plateau +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-lse + log_train_every: 200 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size, lr_initial + frame_averaging: 3D + fa_frames: random + dataset: + train: + lse_shift: true + val: + lse_shift: true + test: + lse_shift: true + optim: + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + batch_size: 64 + initial_lr: 0.0005 + max_epochs: 1500 + loss_energy: mse + loss_force: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.00001 + es_warmup_epochs: 500 + # all below is for the ReduceLROnPlateau scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + model: + act: swish + att_heads: 1 + complex_mp: false + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: '' + force_decoder_type: null + graph_norm: true + hidden_channels: 110 + 
max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + pg_hidden_channels: 32 + phys_embeds: true + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + + +runs: + - model: + complex_mp: false + cutoff: 5.0 + graph_norm: true + hidden_channels: 110 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + pg_hidden_channels: 32 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - model: + complex_mp: false + cutoff: 5.0 + graph_norm: true + hidden_channels: 256 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - model: + complex_mp: false + cutoff: 5.0 + graph_norm: true + hidden_channels: 256 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 64 + num_interactions: 4 + phys_embeds: false + second_layer_MLP: true + skip_co: true + + - model: + complex_mp: false + cutoff: 5.0 + graph_norm: true + hidden_channels: 384 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 50 + num_interactions: 4 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 384 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 50 + num_interactions: 5 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 384 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 50 + num_interactions: 5 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + batch_size: 128 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 384 + max_num_neighbors: 40 + num_filters: 384 + num_gaussians: 50 + num_interactions: 5 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + batch_size: 128 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 100 + max_num_neighbors: 40 + num_filters: 100 + num_gaussians: 100 + num_interactions: 3 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + batch_size: 1024 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 100 + max_num_neighbors: 40 + num_filters: 100 + num_gaussians: 100 + num_interactions: 3 + phys_embeds: true + second_layer_MLP: true + skip_co: true + + - optim: + initial_lr: 0.001 + batch_size: 128 + model: + complex_mp: true + cutoff: 5.0 + graph_norm: true + hidden_channels: 512 + max_num_neighbors: 40 + num_filters: 256 + num_gaussians: 50 + num_interactions: 3 + phys_embeds: true + second_layer_MLP: true + skip_co: true diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml index b0af9eec5b..3ca3a58048 100644 --- a/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v6.yaml @@ -66,6 +66,7 @@ orion: model/phys_embeds: choices([True, False]) model/second_layer_MLP: choices([True, False]) model/skip_co: choices([True, False]) + model/complex_mp: choices([True, False]) algorithms: asha: seed: 123 diff --git a/configs/exps/qm7x/fanet-lse-v1.yaml b/configs/exps/qm7x/fanet-lse-v1.yaml index 5dd3da7def..f2afd44367 100644 --- a/configs/exps/qm7x/fanet-lse-v1.yaml +++ b/configs/exps/qm7x/fanet-lse-v1.yaml @@ -116,7 +116,16 @@ runs: 
regress_forces: direct_with_gradient_target graph_norm: true skip_co: true - complex_mlp: true + + - model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mp: true - model: num_interactions: 4 @@ -126,7 +135,6 @@ runs: regress_forces: direct_with_gradient_target graph_norm: true skip_co: true - complex_mlp: true - model: num_interactions: 4 @@ -136,7 +144,6 @@ runs: regress_forces: direct_with_gradient_target graph_norm: true skip_co: true - complex_mlp: true - model: num_interactions: 4 @@ -146,5 +153,13 @@ runs: regress_forces: direct_with_gradient_target graph_norm: true skip_co: true - complex_mlp: true + - model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mp: true From 56c8e6e6d9330443092f8842972830e1be15689f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 12:01:45 -0500 Subject: [PATCH 225/273] soft stop script --- configs/exps/qm7x/fanet-lse-v2.yaml | 220 ++++++++++++++++++++++++++ configs/exps/qm7x/fanet-orion-v1.yaml | 0 configs/models/tasks/qm7x.yaml | 1 + ocpmodels/trainers/single_trainer.py | 3 +- scripts/soft_stop_jobs.py | 28 ++++ 5 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 configs/exps/qm7x/fanet-lse-v2.yaml create mode 100644 configs/exps/qm7x/fanet-orion-v1.yaml create mode 100644 scripts/soft_stop_jobs.py diff --git a/configs/exps/qm7x/fanet-lse-v2.yaml b/configs/exps/qm7x/fanet-lse-v2.yaml new file mode 100644 index 0000000000..dedca23aee --- /dev/null +++ b/configs/exps/qm7x/fanet-lse-v2.yaml @@ -0,0 +1,220 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + env: ocp-a100 + +default: + config: fanet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + energy_head: False + frame_averaging: 3D + fa_frames: random + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 0.201 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + att_heads: 1 + complex_mp: false + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: false + force_decoder_type: mlp + graph_norm: false + hidden_channels: 128 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 128 + num_gaussians: 20 + num_interactions: 4 + pg_hidden_channels: 32 + phys_embeds: true + regress_forces: direct + second_layer_MLP: false + skip_co: false + tag_hidden_channels: 0 + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + test: + lse_shift: True + +runs: + - {} + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 0 + 
force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 6 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mp: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 512 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 100 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 3 + num_filters: 512 + hidden_channels: 1024 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + + - optim: + energy_coefficient: 1 + energy_grad_coefficient: 10 + force_coefficient: 100 + model: + num_interactions: 4 + num_filters: 256 + hidden_channels: 256 + num_gaussians: 20 + regress_forces: direct_with_gradient_target + graph_norm: true + skip_co: true + complex_mp: true diff --git a/configs/exps/qm7x/fanet-orion-v1.yaml b/configs/exps/qm7x/fanet-orion-v1.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/configs/models/tasks/qm7x.yaml b/configs/models/tasks/qm7x.yaml index 10919390a2..defb898e91 100644 --- a/configs/models/tasks/qm7x.yaml +++ b/configs/models/tasks/qm7x.yaml @@ -1,6 +1,7 @@ default: trainer: single logger: wandb + eval_on_test: True model: otf_graph: False diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index d73a5f2743..289e452f52 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -355,9 +355,10 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): ): if self.early_stopping_file.exists(): print("\n\n >>> 🛑 
Early stopping file found.\n\n") + now = self.now.replace(" ", "_").replace(":", "-") self.early_stopping_file.rename( self.early_stopping_file.parent - / f"{self.early_stopping_file.stem}_{self.now}.txt" + / f"{self.early_stopping_file.stem}_{now}.stopped" ) else: print(f"\n\n >>> 🛑 {self.early_stopper.reason}\n\n") diff --git a/scripts/soft_stop_jobs.py b/scripts/soft_stop_jobs.py new file mode 100644 index 0000000000..6abae6ae0b --- /dev/null +++ b/scripts/soft_stop_jobs.py @@ -0,0 +1,28 @@ +from minydra import resolved_args +from pathlib import Path +import os +import re + +if __name__ == "__main__": + args = resolved_args() + assert "jobs" in args + jobs = [ + j.strip() + for j in str(args.jobs).replace(",", " ").replace(" ", " ").split(" ") + ] + runs = Path(os.environ["SCRATCH"]) / "ocp" / "runs" + outs = [(runs / j / "output-0.txt") for j in jobs] + confirmed = args.no_confirm or ( + "y" + in input(f"\nAbout to early-stop jobs:\n {', '.join(jobs)}\nContinue? [y/n]: ") + ) + if confirmed: + for out in outs: + if not out.exists(): + print(f"Output file for job {out.parent.name} not found") + continue + stop = re.findall(r"early_stopping_file: (.+)", out.read_text()) + if stop: + Path(stop[0]).touch() + else: + print(f"Early stopping file not found in {str(out)}") From 29776e88d7375cc5a0f4cb5cde75ac0b5120562a Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 12:32:17 -0500 Subject: [PATCH 226/273] update objective logging --- configs/exps/icml/qm7x/fanet-orion-v1.yaml | 108 +++++++++++++++++++++ configs/exps/qm7x/fanet-orion-v1.yaml | 0 ocpmodels/trainers/base_trainer.py | 2 + 3 files changed, 110 insertions(+) create mode 100644 configs/exps/icml/qm7x/fanet-orion-v1.yaml delete mode 100644 configs/exps/qm7x/fanet-orion-v1.yaml diff --git a/configs/exps/icml/qm7x/fanet-orion-v1.yaml b/configs/exps/icml/qm7x/fanet-orion-v1.yaml new file mode 100644 index 0000000000..90e4986367 --- /dev/null +++ b/configs/exps/icml/qm7x/fanet-orion-v1.yaml @@ -0,0 +1,108 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 5 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + env: ocp-a100 + +default: + config: fanet-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + log_train_every: 250 + energy_head: False + frame_averaging: 3D + fa_frames: random + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 0.201 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + att_heads: 1 + complex_mp: false + cutoff: 5.0 + edge_embed_type: all_rij + energy_head: false + force_decoder_type: mlp + graph_norm: false + hidden_channels: 128 + max_num_neighbors: 40 + mp_type: updownscale_base + num_filters: 128 + num_gaussians: 20 + num_interactions: 4 + pg_hidden_channels: 32 + phys_embeds: true + regress_forces: direct + second_layer_MLP: false + skip_co: false + tag_hidden_channels: 0 + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: 
+ lse_shift: True + test: + lse_shift: True + orion_mult_factor: + value: 25 + targets: num_filters, num_gaussians, force_coefficient + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 150 + + unique_exp_name: fanet-qm7x-v1.0.0 + + space: + optim/max_steps: fidelity(100000, 2000000, base=2) + optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) + optim/energy_grad_coefficient: uniform(1, 25, discrete=True) + optim/force_coefficient: gaussian(4, 0.5, discrete=True) + + model/complex_mp: choices([True, False]) + model/cutoff: uniform(4.5, 6.5, precision=1) + model/edge_embed_type: all_rij + model/graph_norm: choices([True, False]) + model/hidden_channels: choices([100, 200, 300, 400, 500, 1000]) + model/max_num_neighbors: choices([30, 40, 50]) + model/num_filters: uniform(7, 16, discrete=True) + model/num_gaussians: uniform(1, 5, discrete=True) + model/num_interactions: uniform(3, 7, discrete=True) + model/phys_embeds: choices([True, False]) + model/regress_forces: choices(['direct', 'direct_with_gradient_target']) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices([True, False]) + + algorithms: + asha: + seed: 123 + num_rungs: 8 + num_brackets: 2 diff --git a/configs/exps/qm7x/fanet-orion-v1.yaml b/configs/exps/qm7x/fanet-orion-v1.yaml deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 2bc1a575c3..b6db9fb25c 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -866,6 +866,8 @@ def eval_all_splits( if self.config["model"].get("regress_forces", False): overall_forces_mae = cumulated_forces_mae / len(all_splits) self.logger.log({"Overall Forces MAE": overall_forces_mae}) + self.objective = (overall_energy_mae + overall_forces_mae) / 2 + self.logger.log({"Objective": self.objective}) # Run on test split if final and "test" in self.config["dataset"] and self.eval_on_test: From cce9213816ca362525a2248d7cd863929b2516c9 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 12:36:34 -0500 Subject: [PATCH 227/273] don't print all orion outputs --- configs/exps/icml/qm7x/fanet-orion-v1.yaml | 4 ++-- launch_exp.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm7x/fanet-orion-v1.yaml b/configs/exps/icml/qm7x/fanet-orion-v1.yaml index 90e4986367..9e8bf2e19a 100644 --- a/configs/exps/icml/qm7x/fanet-orion-v1.yaml +++ b/configs/exps/icml/qm7x/fanet-orion-v1.yaml @@ -4,7 +4,7 @@ job: cpus: 5 gres: gpu:16gb:1 partition: long - code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-3 env: ocp-a100 default: @@ -77,7 +77,7 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 150 + n_jobs: 50 unique_exp_name: fanet-qm7x-v1.0.0 diff --git a/launch_exp.py b/launch_exp.py index 87046a1e6a..e25963210f 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -172,6 +172,7 @@ def get_args_or_exp(key, args, exp): if __name__ == "__main__": is_interrupted = False + n_jobs = None args = resolved_args() assert "exp" in args regex = args.get("match", ".*") @@ -235,7 +236,10 @@ def get_args_or_exp(key, args, exp): commands = [c for c in commands if re.findall(regex, c)] - print(f"🔥 About to run {len(commands)} jobs:\n\n • " + "\n\n • ".join(commands)) + print( + f"🔥 About to run {len(commands)} jobs:\n\n • " + + "\n\n • ".join(commands if 
n_jobs is None else commands[:1]) + ) separator = "\n" * 4 + f"{'#' * 80}\n" * 4 + "\n" * 4 text = "<><><> Experiment command: $ " + " ".join(["python"] + sys.argv) From effa8916029c23fc226aaccfad184bb20f7f67a2 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:20:49 -0500 Subject: [PATCH 228/273] improve `no_confirm` arg --- launch_exp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index e25963210f..4fe5eb1b1c 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -247,9 +247,9 @@ def get_args_or_exp(key, args, exp): text += "\n<><><> Experiment config:\n\n-----" + exp_file.read_text() + "-----" text += "\n<><><> Experiment runs:\n\n • " + "\n\n • ".join(commands) + separator - confirm = args.no_confirm or input("\n🚦 Confirm? [y/n] : ") + confirm = args.no_confirm or "y" in input("\n🚦 Confirm? [y/n] : ") - if confirm == "y": + if confirm: try: if "orion" in exp: search_path.parent.mkdir(exist_ok=True, parents=True) From 5b6b5c21f46e60b4bdf9560c2e99556fcd15e00c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:21:00 -0500 Subject: [PATCH 229/273] handle map function --- ocpmodels/common/timer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/timer.py b/ocpmodels/common/timer.py index cb38f1c731..d30935fd26 100644 --- a/ocpmodels/common/timer.py +++ b/ocpmodels/common/timer.py @@ -46,7 +46,7 @@ def reset(self): self.times = defaultdict(list) self.timers = {} - def prepare_for_logging(self): + def prepare_for_logging(self, map_func=lambda x: x): """ Computes mean and standard deviation of all timers. Returns a tuple: (mean_times_dict, std_times_dict) @@ -57,8 +57,9 @@ def prepare_for_logging(self): mean_times = {} std_times = {} for k, v in self.times.items(): - mean_times[k] = np.mean(v) - std_times[k] = np.std(v) + data = list(map(map_func, v)) + mean_times[k] = np.mean(data) + std_times[k] = np.std(data) return mean_times, std_times def next(self, name, ignore=None): From 874083c5d53aad512a375ab062d611dc4e2795f2 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:21:09 -0500 Subject: [PATCH 230/273] silent of trainer is silent --- ocpmodels/modules/scheduler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ocpmodels/modules/scheduler.py b/ocpmodels/modules/scheduler.py index ca440b1854..6eba1fd39b 100644 --- a/ocpmodels/modules/scheduler.py +++ b/ocpmodels/modules/scheduler.py @@ -22,10 +22,11 @@ class LRScheduler: optimizer (obj): torch optim object """ - def __init__(self, optimizer, optim_config): + def __init__(self, optimizer, optim_config, silent=False): self.optimizer = optimizer self.optim_config = optim_config.copy() self.warmup_scheduler = None + self.silent = silent if self.optim_config.get("scheduler"): self.scheduler_type = self.optim_config["scheduler"] else: @@ -47,9 +48,11 @@ def scheduler_lambda_fn(x): T_max = self.optim_config.get("fidelity_max_steps") if T_max is None: T_max = self.optim_config["max_steps"] - print(f"Using max_steps for scheduler -> {T_max}") + if not self.silent: + print(f"Using max_steps for scheduler -> {T_max}") else: - print(f"Using fidelity_max_steps for scheduler -> {T_max}") + if not self.silent: + print(f"Using fidelity_max_steps for scheduler -> {T_max}") self.warmup_scheduler = warmup.ExponentialWarmup( self.optimizer, warmup_period=self.optim_config["warmup_steps"] From fb555967e7f2add4abf471a333c82e20a776896c Mon Sep 17 00:00:00 2001 From: Victor Schmidt 
Date: Sun, 22 Jan 2023 20:21:26 -0500
Subject: [PATCH 231/273] rename file

---
 .../{measure_val_times.py => legacy_phast_measure_val_times.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename scripts/{measure_val_times.py => legacy_phast_measure_val_times.py} (100%)

diff --git a/scripts/measure_val_times.py b/scripts/legacy_phast_measure_val_times.py
similarity index 100%
rename from scripts/measure_val_times.py
rename to scripts/legacy_phast_measure_val_times.py

From 50a2467ba89ccbf12948470343f0fe02f7b09d35 Mon Sep 17 00:00:00 2001
From: Victor Schmidt
Date: Sun, 22 Jan 2023 20:21:36 -0500
Subject: [PATCH 232/273] script to keep running jobs

---
 scripts/watch_and_run_orion_jobs.py | 39 +++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)
 create mode 100644 scripts/watch_and_run_orion_jobs.py

diff --git a/scripts/watch_and_run_orion_jobs.py b/scripts/watch_and_run_orion_jobs.py
new file mode 100644
index 0000000000..c23945c99d
--- /dev/null
+++ b/scripts/watch_and_run_orion_jobs.py
@@ -0,0 +1,39 @@
+import os
+from sys import exit
+from time import sleep
+
+from minydra import resolved_args
+
+
+def n_jobs():
+    return len(os.popen("squeue -u $USER").read().splitlines()) - 1
+
+
+if __name__ == "__main__":
+    args = resolved_args()
+    assert "exp" in args
+    hours = args.get("hours", 1)
+    min_jobs = args.get("min_jobs", 1)
+    cmd = f"python launch_exp.py exp={args.exp} no_confirm='y' " + "n_jobs={new_jobs}"
+    print(
+        f"\nChecking every {hours} hours for new jobs to launch for exp {args.exp}",
+        f"so that you always have at least {min_jobs} jobs running\n",
+    )
+
+    if "y" not in input("Continue? [y/n]: "):
+        exit()
+
+    i = 0
+
+    try:
+        while True:
+            j = n_jobs()
+            print(f"\nNumber of jobs at iteration {i}: {j}")
+            if j < min_jobs:
+                new_jobs = min_jobs - j
+                print(f" Launching {new_jobs} jobs at iteration {i}")
+                os.system(cmd.format(new_jobs=new_jobs))
+            i += 1
+            sleep(hours * 60 * 60)
+    except KeyboardInterrupt:
+        print("Exiting...")

From f7f9550dfdc3fd67e166f7dff5f02c7f2afa9e79 Mon Sep 17 00:00:00 2001
From: Victor Schmidt
Date: Sun, 22 Jan 2023 20:22:11 -0500
Subject: [PATCH 233/273] add silent mode

---
 ocpmodels/common/utils.py | 34 +++++++++++++++++++---------------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py
index 57e91e2944..31185919e9 100644
--- a/ocpmodels/common/utils.py
+++ b/ocpmodels/common/utils.py
@@ -866,7 +866,7 @@ def load_config_legacy(path: str, previous_includes: list = []):
     return config, duplicates_warning, duplicates_error
 
 
-def set_cpus_to_workers(config):
+def set_cpus_to_workers(config, silent=False):
     if not config.get("no_cpus_to_workers"):
         cpus = count_cpus()
         gpus = count_gpus()
@@ -875,7 +875,7 @@ def set_cpus_to_workers(config):
             workers = cpus - 1
         else:
             workers = cpus // gpus
-        if not config["silent"]:
+        if not config["silent"] and not silent:
             print(
                 f"🏭 Overriding num_workers from {config['optim']['num_workers']}",
                 f"to {workers} to match the machine's CPUs.",
) @@ -1001,18 +1001,22 @@ def build_config(args, args_override): if "orion" in k or "fidelity" in k: dels[k] = copy.deepcopy(continue_config[k]) continue_config[k] = None + if not silent: + print( + "🅾️ Removing orion config from continue config. Set to None:", + "{" + + ", ".join([f"{k}: {v}->None" for k, v in dels.items()]) + + "}", + ) + if not silent: print( - "🅾️ Removing orion config from continue config. Set to None:", - "{" + ", ".join([f"{k}: {v}->None" for k, v in dels.items()]) + "}", - ) - print( - f"✅ Loading config from directory {str(cont_dir)}" - + ( - f" and latest checkpoint: {latest_ckpt}" - if args.continue_from_dir - else " (restarting from scratch)" + f"✅ Loading config from directory {str(cont_dir)}" + + ( + f" and latest checkpoint: {latest_ckpt}" + if args.continue_from_dir + else " (restarting from scratch)" + ) ) - ) args.config = continue_config["config"] if args.config is None: @@ -1075,7 +1079,7 @@ def build_config(args, args_override): config = merge_dicts(config, cli) check_regress_forces(config) - config = set_cpus_to_workers(config) + config = set_cpus_to_workers(config, silent) config = set_qm9_target_stats(config) config = set_qm7x_target_stats(config) config = override_drac_paths(config) From e137faecee94b35326aa2387a81df2f2273f4ed0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:22:28 -0500 Subject: [PATCH 234/273] add inference mode in forward --- ocpmodels/common/data_parallel.py | 6 ++-- ocpmodels/models/base_model.py | 25 +++++++------- ocpmodels/trainers/base_trainer.py | 49 ++++++++++++++++------------ ocpmodels/trainers/single_trainer.py | 17 ++++++---- 4 files changed, 56 insertions(+), 41 deletions(-) diff --git a/ocpmodels/common/data_parallel.py b/ocpmodels/common/data_parallel.py index 9c57b6bc71..6f0ceca86b 100644 --- a/ocpmodels/common/data_parallel.py +++ b/ocpmodels/common/data_parallel.py @@ -48,12 +48,12 @@ def __init__(self, module, output_device, num_gpus): output_device=self.src_device, ) - def forward(self, batch_list): + def forward(self, batch_list, **kwargs): if self.cpu: - return self.module(batch_list[0]) + return self.module(batch_list[0], **kwargs) if len(self.device_ids) == 1: - return self.module(batch_list[0].to(f"cuda:{self.device_ids[0]}")) + return self.module(batch_list[0].to(f"cuda:{self.device_ids[0]}"), **kwargs) for t in chain(self.module.parameters(), self.module.buffers()): if t.device != self.src_device: diff --git a/ocpmodels/models/base_model.py b/ocpmodels/models/base_model.py index 3a0ac3a93f..ad769b6c09 100644 --- a/ocpmodels/models/base_model.py +++ b/ocpmodels/models/base_model.py @@ -31,11 +31,12 @@ def energy_forward(self, data): def forces_forward(self, preds): raise NotImplementedError - def forward(self, data): + def forward(self, data, mode="train"): grad_forces = forces = None # energy gradient w.r.t. 
positions will be computed - data.pos.requires_grad_(True) + if mode == "train" or self.regress_forces == "from_energy": + data.pos.requires_grad_(True) # predict energy preds = self.energy_forward(data) @@ -46,12 +47,13 @@ def forward(self, data): # predict forces forces = self.forces_forward(preds) - if "gemnet" in self.__class__.__name__.lower(): - # gemnet forces are already computed - grad_forces = forces - else: - # compute forces from energy gradient - grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) + if mode == "train" or self.regress_forces == "from_energy": + if "gemnet" in self.__class__.__name__.lower(): + # gemnet forces are already computed + grad_forces = forces + else: + # compute forces from energy gradient + grad_forces = self.forces_as_energy_grad(data.pos, preds["energy"]) if self.regress_forces == "from_energy": # predicted forces are the energy gradient @@ -59,9 +61,10 @@ def forward(self, data): elif self.regress_forces in {"direct", "direct_with_gradient_target"}: # predicted forces are the model's direct forces preds["forces"] = forces - # store the energy gradient as the target. Used for metrics - # only in "direct" mode. - preds["forces_grad_target"] = grad_forces.detach() + if mode == "train": + # store the energy gradient as the target. Used for metrics + # only in "direct" mode. + preds["forces_grad_target"] = grad_forces.detach() else: raise ValueError( f"Unknown forces regression mode {self.regress_forces}" diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index b6db9fb25c..9252b2a9af 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -258,19 +258,20 @@ def load_datasets(self): * (n_train / batch_size) ) ) - print( - "Setting fidelity_max_steps to {}".format( - self.config["optim"]["fidelity_max_steps"] + if not self.silent: + print( + "Setting fidelity_max_steps to {}".format( + self.config["optim"]["fidelity_max_steps"] + ) ) - ) if max_samples > 0: - if max_epochs > 0: + if max_epochs > 0 and not self.silent: print( "\nWARNING: Both max_samples and max_epochs are set.", "Using max_samples.", ) - if max_steps > 0: + if max_steps > 0 and not self.silent: print( "WARNING: Both max_samples and max_steps are set.", "Using max_samples.\n", @@ -282,7 +283,7 @@ def load_datasets(self): np.ceil(max_samples / batch_size) ) elif max_steps > 0: - if max_epochs > 0: + if max_epochs > 0 and not self.silent: print( "\nWARNING: Both max_steps and max_epochs are set.", "Using max_steps.\n", @@ -290,23 +291,25 @@ def load_datasets(self): self.config["optim"]["max_epochs"] = int( np.ceil(max_steps / (n_train / batch_size)) ) - print( - "Setting max_epochs to", - self.config["optim"]["max_epochs"], - f"from max_steps ({max_steps}),", - f"dataset length ({n_train}),", - f"and batch_size ({batch_size})\n", - ) + if not self.silent: + print( + "Setting max_epochs to", + self.config["optim"]["max_epochs"], + f"from max_steps ({max_steps}),", + f"dataset length ({n_train}),", + f"and batch_size ({batch_size})\n", + ) else: self.config["optim"]["max_steps"] = int( np.ceil(max_epochs * (n_train / batch_size)) ) - print( - "Setting max_steps to ", - f"{self.config['optim']['max_steps']} from", - f"max_epochs ({max_epochs}), dataset length", - f"({n_train}), and batch_size ({batch_size})\n", - ) + if not self.silent: + print( + "Setting max_steps to ", + f"{self.config['optim']['max_steps']} from", + f"max_epochs ({max_epochs}), dataset length", + f"({n_train}), and batch_size 
({batch_size})\n", + ) self.samplers[split] = self.get_sampler( self.datasets[split], batch_size, shuffle=shuffle @@ -498,7 +501,11 @@ def load_optimizer(self): ) def load_extras(self): - self.scheduler = LRScheduler(self.optimizer, self.config["optim"]) + self.scheduler = LRScheduler( + self.optimizer, + self.config["optim"], + silent=self.silent, + ) self.clip_grad_norm = self.config["optim"].get("clip_grad_norm") self.ema_decay = self.config["optim"].get("ema_decay") if self.ema_decay: diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 289e452f52..d5a3d2a197 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -78,10 +78,11 @@ def load_task(self): device=self.device, ) else: - print( - "Warning: grad_target_mean not found in normalizer but", - "regress_forces and normalize_labels are true.", - ) + if not self.silent: + print( + "Warning: grad_target_mean not found in normalizer but", + "regress_forces and normalize_labels are true.", + ) self.normalizers["grad_target"] = Normalizer( tensor=self.datasets["train"].data.y[ self.datasets["train"].__indices__ @@ -431,7 +432,7 @@ def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times) for ds in self.datasets.values(): ds.close_db() - def model_forward(self, batch_list): + def model_forward(self, batch_list, mode="train"): # Distinguish frame averaging from base case. if self.config["frame_averaging"] and self.config["frame_averaging"] != "DA": original_pos = batch_list[0].pos @@ -444,7 +445,10 @@ def model_forward(self, batch_list): batch_list[0].pos = batch_list[0].fa_pos[i] if self.task_name in OCP_TASKS: batch_list[0].cell = batch_list[0].fa_cell[i] - preds = self.model(deepcopy(batch_list)) + + # forward pass + preds = self.model(deepcopy(batch_list), mode=mode) + e_all.append(preds["energy"]) if preds.get("pooling_loss") is not None: p_all.append(preds["pooling_loss"]) @@ -460,6 +464,7 @@ def model_forward(self, batch_list): .view(-1, 3) ) f_all.append(g_forces) + batch_list[0].pos = original_pos if self.task_name in OCP_TASKS: batch_list[0].cell = original_cell From bf406946709e0eb72cfcaebd4754bb3e85ad3709 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Sun, 22 Jan 2023 20:39:31 -0500 Subject: [PATCH 235/273] update force coefficient search space --- configs/exps/icml/qm7x/fanet-orion-v1.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm7x/fanet-orion-v1.yaml b/configs/exps/icml/qm7x/fanet-orion-v1.yaml index 9e8bf2e19a..d7df89b3e6 100644 --- a/configs/exps/icml/qm7x/fanet-orion-v1.yaml +++ b/configs/exps/icml/qm7x/fanet-orion-v1.yaml @@ -79,17 +79,16 @@ orion: # Remember to change the experiment name if you change anything in the search space n_jobs: 50 - unique_exp_name: fanet-qm7x-v1.0.0 + unique_exp_name: fanet-qm7x-v1.0.1 space: optim/max_steps: fidelity(100000, 2000000, base=2) optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) optim/energy_grad_coefficient: uniform(1, 25, discrete=True) - optim/force_coefficient: gaussian(4, 0.5, discrete=True) + optim/force_coefficient: uniform(3, 5, discrete=True) model/complex_mp: choices([True, False]) model/cutoff: uniform(4.5, 6.5, precision=1) - model/edge_embed_type: all_rij model/graph_norm: choices([True, False]) model/hidden_channels: choices([100, 200, 300, 400, 500, 1000]) model/max_num_neighbors: choices([30, 40, 50]) From feb61c4c3e2f49cd76f833bddd9b82c30a051ef7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 
23 Jan 2023 10:20:26 -0500 Subject: [PATCH 236/273] inference timing script --- scripts/measure_val_inference_time.py | 111 ++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 scripts/measure_val_inference_time.py diff --git a/scripts/measure_val_inference_time.py b/scripts/measure_val_inference_time.py new file mode 100644 index 0000000000..c17fb5a9e3 --- /dev/null +++ b/scripts/measure_val_inference_time.py @@ -0,0 +1,111 @@ +import copy +import sys +from argparse import Namespace +from pathlib import Path + +import torch +from minydra import resolved_args +from tqdm import tqdm + +sys.path.append(str(Path(__file__).resolve().parent.parent)) + +from ocpmodels.common.timer import Times +from ocpmodels.common.utils import ( + build_config, + move_lmdb_data_to_slurm_tmpdir, + resolve, + setup_imports, +) +from ocpmodels.trainers.single_trainer import SingleTrainer + +if __name__ == "__main__": + args = resolved_args( + defaults={ + "base_path": "$SCRATCH/ocp/runs", + "n_loops": 1, + "others": "", + "job_ids": "", + }, + ).pretty_print() + base = resolve(args.base_path) + job_ids = [j.strip() for j in str(args.job_ids).split(",")] + paths = [Path(base) / j for j in job_ids if j] + [ + resolve(p.strip()) for p in args.others.split(",") + ] + run_dir = resolve("$SCRATCH/ocp/inference_time") + + setup_imports() + + torch.set_grad_enabled(False) + + conf_args = [ + Namespace( + restart_from_dir=str(p), + continue_from_dir=None, + keep_orion_config=False, + run_dir=run_dir / "-".join(job_ids), + num_nodes=1, + num_gpus=1, + ) + for p in paths + ] + configs = [ + build_config(ca, [], silent=True) + for ca in tqdm(conf_args, desc="Loading configs".ljust(40)) + ] + configs = [(l, config) for config in configs for l in range(args.n_loops)] + names = [ + f'{config["restart_from_dir"].name}-{config["config"]}' for _, config in configs + ] + + times = {} + + for k, (l, config) in enumerate( + tqdm( + configs, + desc=f"Timing {args.n_loops}x{len(conf_args)}={len(configs)} configs".ljust( + 40 + ), + ) + ): + config["logger"] = "dummy" + config["silent"] = True + + od = copy.deepcopy(config["dataset"]) + for split in od: + if split != "default_val" and split != config["dataset"]["default_val"]: + del config["dataset"][split] + config = move_lmdb_data_to_slurm_tmpdir(config) + for split in od: + if split != "default_val" and split != config["dataset"]["default_val"]: + config["dataset"][split] = od[split] + + if l == 0: + trainer = SingleTrainer(**config) + timer = Times(gpu=True) + + name = names[k] + + for i, b in enumerate( + tqdm( + trainer.loaders[trainer.config["dataset"]["default_val"]], + desc=f"{name} (loop {l+1}/{args.n_loops})".ljust(40), + leave=False, + ) + ): + with torch.cuda.amp.autocast(enabled=trainer.scaler is not None): + with timer.next("forward"): + _ = trainer.model_forward(b, mode="inference") + + if l == args.n_loops - 1: + mean, std = timer.prepare_for_logging( + map_func=lambda t: t / trainer.config["optim"]["batch_size"] + ) + times[name] = mean["forward"] + + print( + " • " + + "\n • ".join( + f"{k}: {v:.6f} s / sample = {1/v:.2f} samples / s" for k, v in times.items() + ) + ) From f3bdf8b36ab8ceb0fc9de02a00ed8f2eb9cb2e1d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 12:49:40 -0500 Subject: [PATCH 237/273] fix eval_on_test --- ocpmodels/trainers/base_trainer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 
9252b2a9af..1c02706b49 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -78,10 +78,7 @@ def __init__(self, **kwargs): self.test_ri = self.config["test_ri"] self.is_debug = self.config["is_debug"] self.is_hpo = self.config["is_hpo"] - if self.task_name == "qm9": - self.eval_on_test = self.config["eval_on_test"] - else: - self.eval_on_test = False + self.eval_on_test = bool(self.config.get("eval_on_test")) self.silent = self.config["silent"] self.datasets = {} self.samplers = {} From cd73b6fb6627596f3ef79b1611eaeae2ff90db72 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 12:59:14 -0500 Subject: [PATCH 238/273] fix checkpoint continue --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 31185919e9..b2867addf6 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -992,9 +992,9 @@ def build_config(args, args_override, silent=False): latest_ckpt = str( sorted(ckpts, key=lambda c: float(c.stem.split("-")[-1]))[-1] ) + continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if args.continue_from_dir: continue_config["checkpoint"] = str(latest_ckpt) - continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if not args.keep_orion_config: dels = {} for k in continue_config: From cb7a488c301a496f9a3a432ce65d69144bfe57a0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 16:40:53 -0500 Subject: [PATCH 239/273] remove detect-anomaly --- ocpmodels/models/forcenet.py | 1 - ocpmodels/models/sfarinet.py | 1 - 2 files changed, 2 deletions(-) diff --git a/ocpmodels/models/forcenet.py b/ocpmodels/models/forcenet.py index ebb9ae4bd1..2a47327923 100644 --- a/ocpmodels/models/forcenet.py +++ b/ocpmodels/models/forcenet.py @@ -250,7 +250,6 @@ class ForceNet(BaseModel): def __init__(self, **kwargs): super(ForceNet, self).__init__() - torch.autograd.set_detect_anomaly(True) self.ablation = kwargs["ablation"] self.basis = kwargs["basis"] self.cutoff = kwargs["cutoff"] diff --git a/ocpmodels/models/sfarinet.py b/ocpmodels/models/sfarinet.py index da3ad2a985..1833b9a2c4 100644 --- a/ocpmodels/models/sfarinet.py +++ b/ocpmodels/models/sfarinet.py @@ -329,7 +329,6 @@ class SfariNet(BaseModel): def __init__(self, **kwargs): super().__init__() - torch.autograd.set_detect_anomaly(True) self.cutoff = kwargs["cutoff"] self.use_pbc = kwargs["use_pbc"] self.max_num_neighbors = kwargs["max_num_neighbors"] From 551419c7ce862ab0b33d0d123d6647acdc141117 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 17:48:06 -0500 Subject: [PATCH 240/273] auto long-grace for orion long jobs --- launch_exp.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/launch_exp.py b/launch_exp.py index 4fe5eb1b1c..ca1d5fe964 100644 --- a/launch_exp.py +++ b/launch_exp.py @@ -195,12 +195,17 @@ def get_args_or_exp(key, args, exp): exp["unique_exp_name"] = unique_exp_name search_path = orion_base / "search-spaces" / f"{ts}-{unique_exp_name}.yaml" + job_dict = { + "job_name": unique_exp_name, + } + + if (exp["job"].get("partition") or "long") == "long": + job_dict["partition"] = "long-grace" + runs = [ { "orion_exp_config_path": str(search_path), - "job": { - "job_name": unique_exp_name, - }, + "job": job_dict, } for _ in range(n_jobs) ] From a9a98c5d3c03af7dfd48a4ae62f4d1ac779865a0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Mon, 23 Jan 2023 17:52:37 -0500 Subject: 
[PATCH 241/273] v2

---
 .../exps/icml/s2ef/fanet-orion-s2ef-2.yaml | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml

diff --git a/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml b/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml
new file mode 100644
index 0000000000..5cab52d455
--- /dev/null
+++ b/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml
@@ -0,0 +1,64 @@
+# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij
+job:
+  mem: 32GB
+  cpus: 4
+  gres: gpu:1
+  partition: long
+
+default:
+  wandb_project: ocp-3
+  config: fanet-s2ef-2M
+  mode: train
+  test_ri: true
+  wandb_tags: s2ef-2M, orion
+  cp_data_to_tmpdir: true
+  graph_rewiring: remove-tag-0
+  model:
+    edge_embed_type: all_rij
+  frame_averaging: 2D
+  fa_frames: random
+  optim:
+    scheduler: LinearWarmupCosineAnnealingLR
+    force_coefficient: 100
+    energy_coefficient: 1
+    energy_grad_coefficient: 5
+  note:
+    model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co
+    optim: lr_initial, warmup_steps
+    _root_: frame_averaging, fa_frames
+  orion_mult_factor:
+    value: 32
+    targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels
+
+orion:
+  # Remember to change the experiment name if you change anything in the search space
+  n_jobs: 50
+
+  unique_exp_name: fanet-s2ef-2M-v1.2
+
+  space:
+    model/att_heads: choices([1,2,3,4])
+    model/complex_mp: choices([True, False])
+    model/cutoff: choices([4.0, 6.0, 10.0])
+    model/energy_head: choices(["", "weighted-av-final-embeds", "weighted-av-initial-embeds"])
+    model/graph_norm: choices([True, False])
+    model/hidden_channels: uniform(6, 22, discrete=True)
+    model/mp_type: choices(["simple", "base", "sfarinet", "updownscale", "updownscale_base", "base_with_att", "att", "local_env", "updown_local_env"])
+    model/num_filters: uniform(2, 18, discrete=True)
+    model/num_gaussians: uniform(30, 150, discrete=True)
+    model/num_interactions: uniform(3, 6, discrete=True)
+    model/pg_hidden_channels: uniform(0, 1, discrete=True)
+    model/phys_embeds: choices([True, False])
+    model/regress_forces: choices(["direct_with_gradient_target", "direct"])
+    model/second_layer_MLP: choices([True, False])
+    model/skip_co: choices(["add", "concat", False])
+    model/tag_hidden_channels: uniform(0, 2, discrete=True)
+    model/max_num_neighbors: choices([30,40,50])
+    optim/lr_initial: loguniform(5e-5, 5e-4, precision=2)
+    optim/max_epochs: fidelity(6, 22, base=6)
+
+  algorithms:
+    asha:
+      seed: 123
+      num_rungs: 4
+      num_brackets: 2

From aeebbaec6c76af0f1359d5f4303ec96f28caf2a5 Mon Sep 17 00:00:00 2001
From: AlexDuvalinho
Date: Mon, 23 Jan 2023 18:06:51 -0500
Subject: [PATCH 242/273] new configs

---
 .../exps/icml/is2re-all/fanet-orion-4.yaml    |   2 +-
 configs/exps/icml/is2re-all/top-config-3.yaml | 246 ++++++++++++++
 .../exps/icml/s2ef/fanet-orion-s2ef-2.yaml    |  65 ++++
 configs/exps/icml/s2ef/top-config.yaml        | 300 ++++++++++++++++++
 configs/models/tasks/s2ef.yaml                |  18 +-
 5 files changed, 621 insertions(+), 10 deletions(-)
 create mode 100644 configs/exps/icml/is2re-all/top-config-3.yaml
 create mode 100644 configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml
 create mode 100644 configs/exps/icml/s2ef/top-config.yaml

diff --git a/configs/exps/icml/is2re-all/fanet-orion-4.yaml b/configs/exps/icml/is2re-all/fanet-orion-4.yaml
index f86ea559f4..dd1c46f035 100644 --- a/configs/exps/icml/is2re-all/fanet-orion-4.yaml +++ b/configs/exps/icml/is2re-all/fanet-orion-4.yaml @@ -31,7 +31,7 @@ default: orion: # Remember to change the experiment name if you change anything in the search space - n_jobs: 150 + n_jobs: 25 unique_exp_name: fanet-is2re-all-v4 diff --git a/configs/exps/icml/is2re-all/top-config-3.yaml b/configs/exps/icml/is2re-all/top-config-3.yaml new file mode 100644 index 0000000000..eb18bc2f77 --- /dev/null +++ b/configs/exps/icml/is2re-all/top-config-3.yaml @@ -0,0 +1,246 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + time: 15:00:00 + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + wandb_tags: 'best-config' + optim: + batch_size: 256 + eval_batch_size: 256 + cp_data_to_tmpdir: True + +runs: + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 10 + eval_every: 0.25 + + - config: fanet-is2re-all # 2700544 + note: 'top-run eval every epoch' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 12 + eval_every: 1 + + - config: fanet-is2re-all # 2700544 + note: 'top-run eval every epoch' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 12 + eval_every: 1 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 14 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 380 + num_gaussians: 80 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: 
LinearWarmupCosineAnnealingLR + max_epochs: 12 + eval_every: 0.25 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 5.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 10 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 70 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: True + skip_co: concat + cutoff: 5.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 11 + eval_every: 0.4 + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 288 + num_gaussians: 68 + num_interactions: 5 + second_layer_MLP: False + skip_co: concat + cutoff: 4.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 9 + eval_every: 0.4 + + + - config: fanet-is2re-all # 2700544 + note: 'top-run' + frame_averaging: 2D + fa_frames: se3-random + model: + mp_type: updownscale_base + phys_embeds: True + tag_hidden_channels: 32 + pg_hidden_channels: 96 + energy_head: weighted-av-final-embeds + complex_mp: True + graph_norm: True + hidden_channels: 352 + num_filters: 300 + num_gaussians: 75 + num_interactions: 6 + second_layer_MLP: False + skip_co: concat + cutoff: 5.0 + optim: + lr_initial: 0.002 + scheduler: LinearWarmupCosineAnnealingLR + max_epochs: 13 + eval_every: 0.4 \ No newline at end of file diff --git a/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml b/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml new file mode 100644 index 0000000000..7a4b3da0fd --- /dev/null +++ b/configs/exps/icml/s2ef/fanet-orion-s2ef-2.yaml @@ -0,0 +1,65 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + +default: + wandb_project: ocp-3 + config: fanet-s2ef-2M + mode: train + test_ri: true + wandb_tags: s2ef-2M, orion, v2 + cp_data_to_tmpdir: true + graph_rewiring: remove-tag-0 + model: + edge_embed_type: all_rij + graph_norm: True + frame_averaging: 2D + fa_frames: random + optim: + scheduler: LinearWarmupCosineAnnealingLR + force_coefficient: 100 + energy_coefficient: 1 + energy_grad_coefficient: 5 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm, complex_mp, att_heads, second_layer_MLP, skip_co + optim: lr_initial, warmup_steps + _root_: frame_averaging, fa_frames + orion_mult_factor: + value: 32 + targets: hidden_channels, num_filters, pg_hidden_channels, phys_hidden_channels, tag_hidden_channels + +orion: + # Remember to change 
the experiment name if you change anything in the search space + n_jobs: 40 + + unique_exp_name: fanet-s2ef-2M-v2 + + space: + model/att_heads: choices([1,2,3,4]) + model/complex_mp: choices([True, False]) + model/cutoff: choices([4.0, 6.0, 8.0]) + model/energy_head: choices(["weighted-av-final-embeds"]) + model/graph_norm: choices([True, False]) + model/hidden_channels: uniform(6, 21, discrete=True) + model/mp_type: choices(["simple", "base", "updownscale", "updownscale_base", "base_with_att", "updown_local_env"]) + model/num_filters: uniform(3, 18, discrete=True) + model/num_gaussians: uniform(40, 160, discrete=True) + model/num_interactions: uniform(3, 7, discrete=True) + model/pg_hidden_channels: uniform(0, 3, discrete=True) + model/phys_embeds: choices([True, False]) + model/regress_forces: choices(["direct_with_gradient_target", "direct"]) + model/second_layer_MLP: choices([True, False]) + model/skip_co: choices(["concat", False]) + model/tag_hidden_channels: uniform(0, 2, discrete=True) + model/max_num_neighbors: choices([30,40,50]) + optim/lr_initial: loguniform(9e-5, 5e-4, precision=2) + optim/max_epochs: fidelity(8, 22, base=6) + + algorithms: + asha: + seed: 123 + num_rungs: 4 + num_brackets: 2 diff --git a/configs/exps/icml/s2ef/top-config.yaml b/configs/exps/icml/s2ef/top-config.yaml new file mode 100644 index 0000000000..4dc6f04800 --- /dev/null +++ b/configs/exps/icml/s2ef/top-config.yaml @@ -0,0 +1,300 @@ +job: + mem: 32GB + cpus: 4 + gres: gpu:rtx8000:1 + partition: long + +default: + test_ri: True + mode: train + graph_rewiring: remove-tag-0 + cp_data_to_tmp: true + model: + energy_head: 'weighted-av-final-embeds' # False ? frame_averaging: 2D + fa_frames: random + frame_averaging: 2D + wandb_tags: 's2ef-top-config' + optim: + batch_size: 192 + eval_batch_size: 192 + scheduler: LinearWarmupCosineAnnealingLR + force_coefficient: 100 + energy_coefficient: 1 + energy_grad_coefficient: 5 + +runs: + - config: sfarinet-s2ef-2M + note: 'top-config' + fa_fames: random + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 40 + graph_norm: False + mp_type: updownscale + hidden_channels: 480 + num_gaussians: 145 + num_filters: 384 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + second_layer_mlp: False + complex_mp: True + second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'DA' + frame_averaging: DA + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 40 + graph_norm: False + mp_type: updownscale + hidden_channels: 480 + num_gaussians: 145 + num_filters: 384 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: False + second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'bigger se3-random' + frame_averaging: 2D + fa_frames: se3-random + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 40 + graph_norm: True + mp_type: updownscale + hidden_channels: 480 + num_gaussians: 145 + num_filters: 420 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'force only' + 
frame_averaging: 2D + fa_frames: se3-random + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + energy_coefficient: 0 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 40 + graph_norm: False + mp_type: updownscale + hidden_channels: 480 + num_gaussians: 145 + num_filters: 420 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'top-config' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00022 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 30 + graph_norm: True + mp_type: updownscale_base + hidden_channels: 256 + num_gaussians: 128 + num_filters: 480 + num_interactions: 7 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: False + + - config: sfarinet-s2ef-2M + note: 'top-config' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00027 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 30 + graph_norm: True + mp_type: updownscale_base + hidden_channels: 456 + num_gaussians: 128 + num_filters: 600 + num_interactions: 7 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: False + + - config: sfarinet-s2ef-2M + note: 'top-config' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00027 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 30 + graph_norm: True + mp_type: updownscale_base + hidden_channels: 456 + num_gaussians: 128 + num_filters: 600 + num_interactions: 7 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: True + skip_co: "concat" + complex_mp: True + second_layer_MLP: False + + + - config: sfarinet-s2ef-2M + note: 'top-config' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00023 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 50 + graph_norm: True + mp_type: base + hidden_channels: 352 + num_gaussians: 99 + num_filters: 480 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: False + second_layer_MLP: False + + - config: sfarinet-s2ef-2M + note: '' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00023 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct + cutoff: 6 + max_num_neighbors: 50 + graph_norm: True + mp_type: base + hidden_channels: 352 + num_gaussians: 99 + num_filters: 480 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: False + second_layer_MLP: False + + - config: sfarinet-s2ef-2M + note: 'all' + frame_averaging: 2D + fa_frames: all + optim: + lr_initial: 0.00023 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: direct_with_gradient_target + cutoff: 6 + max_num_neighbors: 50 + graph_norm: True + mp_type: base + hidden_channels: 352 + num_gaussians: 99 + num_filters: 480 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: True + 
second_layer_MLP: True + + - config: sfarinet-s2ef-2M + note: 'from_energy' + frame_averaging: 2D + fa_frames: random + optim: + lr_initial: 0.00023 + max_epochs: 22 + force_coefficient: 100 + model: + regress_forces: from_energy + cutoff: 6 + max_num_neighbors: 30 + graph_norm: True + mp_type: base + hidden_channels: 352 + num_gaussians: 99 + num_filters: 480 + num_interactions: 4 + pg_hidden_channels: 64 + tag_hidden_channels: 64 + phys_embeds: False + skip_co: "concat" + complex_mp: False + second_layer_MLP: True \ No newline at end of file diff --git a/configs/models/tasks/s2ef.yaml b/configs/models/tasks/s2ef.yaml index 4916788b07..ef62591945 100644 --- a/configs/models/tasks/s2ef.yaml +++ b/configs/models/tasks/s2ef.yaml @@ -22,37 +22,37 @@ default: dataset: default_val: val_id train: - src: /network/projects/_groups/ocp/oc20/s2ef/200k/train + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/200k/train/ normalize_labels: True target_mean: -0.7554450631141663 target_std: 2.887317180633545 grad_target_mean: 0.0 grad_target_std: 2.887317180633545 val_id: - src: /network/projects/_groups/ocp/oc20/s2ef/all/val_id + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/val_id val_ood_cat: - src: /network/projects/_groups/ocp/oc20/s2ef/all/val_ood_cat + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/val_ood_cat val_ood_ads: - src: /network/projects/_groups/ocp/oc20/s2ef/all/val_ood_ads + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/val_ood_ads val_ood_both: - src: /network/projects/_groups/ocp/oc20/s2ef/all/val_ood_both + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/val_ood_both 200k: dataset: train: - src: /network/projects/_groups/ocp/oc20/s2ef/200k/train + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/200k/train/ 2M: dataset: train: - src: /network/projects/_groups/ocp/oc20/s2ef/2M/train/ + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/2M/train/ 20M: dataset: train: - src: /network/projects/_groups/ocp/oc20/s2ef/20M/train/ + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/20M/train/ all: dataset: train: - src: /network/projects/_groups/ocp/oc20/s2ef/all/train/ + src: /network/scratch/s/schmidtv/ocp/datasets/ocp/s2ef/all/train/ From 5c35196b02d5959f87d8b06f87421eb79ebeb9ba Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 11:20:28 -0500 Subject: [PATCH 243/273] `#SBATCH --tmp=800GB` --- .../icml/qm9/fanet-best-v6-all-targets.yaml | 379 ++++++++++++++++++ sbatch.py | 1 + 2 files changed, 380 insertions(+) create mode 100644 configs/exps/icml/qm9/fanet-best-v6-all-targets.yaml diff --git a/configs/exps/icml/qm9/fanet-best-v6-all-targets.yaml b/configs/exps/icml/qm9/fanet-best-v6-all-targets.yaml new file mode 100644 index 0000000000..661a007a73 --- /dev/null +++ b/configs/exps/icml/qm9/fanet-best-v6-all-targets.yaml @@ -0,0 +1,379 @@ +# scheduler reduce lr on plateau +job: + mem: 12GB + cpus: 4 + gres: gpu:16gb:1 + partition: long + code_loc: /home/mila/s/schmidtv/ocp-project/run-repos/ocp-2 + # dev: true + # verbose: true + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, fanet-qm9-lse + log_train_every: 200 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, energy_head, mp_type + optim: batch_size, lr_initial + frame_averaging: 3D + fa_frames: random + dataset: + train: + lse_shift: true + val: + lse_shift: true + test: + lse_shift: true + optim: + warmup_steps: 3000 + # 
parameters EMA + ema_decay: 0.999 + batch_size: 64 + initial_lr: 0.0003 + max_epochs: 1500 + loss_energy: mae + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 600 + # all below is for the ReduceLROnPlateau scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.9 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 15 + model: + complex_mp: true + cutoff: 6.0 + edge_embed_type: all_rij + energy_head: "" + graph_norm: True + hidden_channels: 400 + max_num_neighbors: 30 + mp_type: updownscale_base + num_filters: 480 + num_gaussians: 100 + num_interactions: 5 + otf_graph: false + pg_hidden_channels: 32 + phys_embeds: false + phys_hidden_channels: 0 + regress_forces: "" + second_layer_MLP: true + skip_co: true + tag_hidden_channels: 0 + use_pbc: false + + +runs: + - dataset: + train: + target: 0 + val: + target: 0 + test: + target: 0 + - dataset: + train: + target: 1 + val: + target: 1 + test: + target: 1 + - dataset: + train: + target: 2 + val: + target: 2 + test: + target: 2 + - dataset: + train: + target: 3 + val: + target: 3 + test: + target: 3 + - dataset: + train: + target: 4 + val: + target: 4 + test: + target: 4 + - dataset: + train: + target: 5 + val: + target: 5 + test: + target: 5 + - dataset: + train: + target: 6 + val: + target: 6 + test: + target: 6 + - dataset: + train: + target: 7 + val: + target: 7 + test: + target: 7 + - dataset: + train: + target: 8 + val: + target: 8 + test: + target: 8 + - dataset: + train: + target: 9 + val: + target: 9 + test: + target: 9 + - dataset: + train: + target: 10 + val: + target: 10 + test: + target: 10 + - dataset: + train: + target: 11 + val: + target: 11 + test: + target: 11 + - dataset: + train: + target: 12 + val: + target: 12 + test: + target: 12 + - dataset: + train: + target: 13 + val: + target: 13 + test: + target: 13 + - dataset: + train: + target: 14 + val: + target: 14 + test: + target: 14 + - dataset: + train: + target: 15 + val: + target: 15 + test: + target: 15 + - dataset: + train: + target: 16 + val: + target: 16 + test: + target: 16 + - dataset: + train: + target: 17 + val: + target: 17 + test: + target: 17 + - dataset: + train: + target: 18 + val: + target: 18 + test: + target: 18 + + - optim: + loss_energy: mse + dataset: + train: + target: 0 + val: + target: 0 + test: + target: 0 + - optim: + loss_energy: mse + dataset: + train: + target: 1 + val: + target: 1 + test: + target: 1 + - optim: + loss_energy: mse + dataset: + train: + target: 2 + val: + target: 2 + test: + target: 2 + - optim: + loss_energy: mse + dataset: + train: + target: 3 + val: + target: 3 + test: + target: 3 + - optim: + loss_energy: mse + dataset: + train: + target: 4 + val: + target: 4 + test: + target: 4 + - optim: + loss_energy: mse + dataset: + train: + target: 5 + val: + target: 5 + test: + target: 5 + - optim: + loss_energy: mse + dataset: + train: + target: 6 + val: + target: 6 + test: + target: 6 + - optim: + loss_energy: mse + dataset: + train: + target: 7 + val: + target: 7 + test: + target: 7 + - optim: + loss_energy: mse + dataset: + train: + target: 8 + val: + target: 8 + test: + target: 8 + - optim: + loss_energy: mse + dataset: + train: + target: 9 + val: + target: 9 + test: + target: 9 + - optim: + loss_energy: mse + dataset: + train: + target: 10 + val: + target: 10 + test: + target: 10 + - optim: + loss_energy: mse + dataset: + train: + target: 11 + val: + target: 11 + test: + target: 11 + - optim: + loss_energy: mse + dataset: + train: + target: 12 + 
val: + target: 12 + test: + target: 12 + - optim: + loss_energy: mse + dataset: + train: + target: 13 + val: + target: 13 + test: + target: 13 + - optim: + loss_energy: mse + dataset: + train: + target: 14 + val: + target: 14 + test: + target: 14 + - optim: + loss_energy: mse + dataset: + train: + target: 15 + val: + target: 15 + test: + target: 15 + - optim: + loss_energy: mse + dataset: + train: + target: 16 + val: + target: 16 + test: + target: 16 + - optim: + loss_energy: mse + dataset: + train: + target: 17 + val: + target: 17 + test: + target: 17 + - optim: + loss_energy: mse + dataset: + train: + target: 18 + val: + target: 18 + test: + target: 18 diff --git a/sbatch.py b/sbatch.py index b7a4d174d7..6d35dfcdb6 100644 --- a/sbatch.py +++ b/sbatch.py @@ -58,6 +58,7 @@ def make_sbatch_params(params): for k, v in params.items(): if v: sps.append(f"#SBATCH --{k}={v}") + sps.append("#SBATCH --tmp=800GB") return "\n".join(sps) + "\n" From 22ea5fc8ebb8f6a3ec52bc47573e99f058baa8c6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:03:27 -0500 Subject: [PATCH 244/273] update qm7x drac paths --- configs/models/tasks/_drac.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/configs/models/tasks/_drac.yaml b/configs/models/tasks/_drac.yaml index bbfa6a9847..ac233b5bf1 100644 --- a/configs/models/tasks/_drac.yaml +++ b/configs/models/tasks/_drac.yaml @@ -95,3 +95,12 @@ qm9: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 test: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm9 + +qm7x: + all: + train: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed + val: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed + test: + src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed From 5ab284d1886febc84aa345d0a90cc3217e9d1d5f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:07:26 -0500 Subject: [PATCH 245/273] fix val_id key --- configs/models/tasks/_drac.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/models/tasks/_drac.yaml b/configs/models/tasks/_drac.yaml index ac233b5bf1..d104f5f5b4 100644 --- a/configs/models/tasks/_drac.yaml +++ b/configs/models/tasks/_drac.yaml @@ -100,7 +100,7 @@ qm7x: all: train: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed - val: + val_id: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed test: src: /home/vsch/projects/rrg-bengioy-ad/vsch/ocp-data/qm7x/processed From 64a61da5112a23b30ffa6bbfc187749a53d88832 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:18:40 -0500 Subject: [PATCH 246/273] narval dpp qm7x --- configs/exps/icml/qm7x/dpp-v1.yaml | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 configs/exps/icml/qm7x/dpp-v1.yaml diff --git a/configs/exps/icml/qm7x/dpp-v1.yaml b/configs/exps/icml/qm7x/dpp-v1.yaml new file mode 100644 index 0000000000..dc663d6870 --- /dev/null +++ b/configs/exps/icml/qm7x/dpp-v1.yaml @@ -0,0 +1,80 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 6 + gres: gpu:1 + time: 24:00:00 + +default: + config: dpp-qm7x-all + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + log_train_every: 250 + energy_head: False + frame_averaging: 
3D + fa_frames: random + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 0.201 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + model: + act: swish + basis_emb_size: 8 + cutoff: 6.0 + energy_head: false + envelope_exponent: 5 + graph_rewiring: '' + hidden_channels: 256 + int_emb_size: 64 + max_num_neighbors: 40 + num_after_skip: 2 + num_before_skip: 1 + num_blocks: 3 + num_output_layers: 3 + num_radial: 6 + num_spherical: 7 + otf_graph: false + out_emb_channels: 192 + pg_hidden_channels: 32 + phys_embeds: false + phys_hidden_channels: 0 + regress_forces: 'from_energy' + tag_hidden_channels: 0 + use_pbc: false + dataset: + train: + rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + test: + lse_shift: True + +runs: + - {} + - model: + num_blocks: 6 \ No newline at end of file From 37101f684c7caa1ecff8bc0239181b0aff3e8557 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:21:37 -0500 Subject: [PATCH 247/273] no FA for dpp --- configs/exps/icml/qm7x/dpp-v1.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm7x/dpp-v1.yaml b/configs/exps/icml/qm7x/dpp-v1.yaml index dc663d6870..45651d6ecf 100644 --- a/configs/exps/icml/qm7x/dpp-v1.yaml +++ b/configs/exps/icml/qm7x/dpp-v1.yaml @@ -18,14 +18,14 @@ default: optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient log_train_every: 250 energy_head: False - frame_averaging: 3D - fa_frames: random + frame_averaging: "" + fa_frames: "" optim: batch_size: 100 max_steps: 2000000 warmup_steps: 3000 lr_initial: 0.00025 - eval_every: 0.201 + eval_every: 1 energy_coefficient: 1 energy_grad_coefficient: 0 force_coefficient: 100 From 78d19f3aa99ee953eabc78c3cc7167b482498ac3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:43:35 -0500 Subject: [PATCH 248/273] schnet dpp qm9 --- configs/exps/icml/qm9/schnet-dpp.yaml | 63 +++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 configs/exps/icml/qm9/schnet-dpp.yaml diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml new file mode 100644 index 0000000000..464ba0cb05 --- /dev/null +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -0,0 +1,63 @@ +# trainset has 4068193 samples +job: + mem: 32GB + cpus: 4 + gres: gpu:16gb:1 + time: 06:00:00 + +default: + wandb_project: ocp-qm + mode: train + test_ri: true + wandb_tags: qm7x + cp_data_to_tmpdir: true + note: + task: name + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces + optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + log_train_every: 250 + energy_head: False + frame_averaging: "" + fa_frames: "" + optim: + batch_size: 100 + max_steps: 2000000 + warmup_steps: 3000 + lr_initial: 0.00025 + eval_every: 1 + energy_coefficient: 1 + energy_grad_coefficient: 0 + force_coefficient: 100 + # parameters EMA + ema_decay: 0.999 + loss_energy: mae + loss_force: mse + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.75 + threshold: 0.001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + dataset: + train: + 
rescale_with_hof: False + lse_shift: True + val_id: + lse_shift: True + test: + lse_shift: True + +runs: + - config: dpp-qm9-all + - config: schnet-qm9-all + model: + cutoff: 5 + hidden_channels: 128 + max_num_neighbors: 40 + num_filters: 128 + num_gaussians: 100 + num_interactions: 6 + pg_hidden_channels: 0 + phys_embeds: false \ No newline at end of file From 0276b3a89605d524fd5d71e6709b5d3a8ea03e37 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:56:06 -0500 Subject: [PATCH 249/273] pop checkpoint --- configs/exps/icml/qm9/schnet-dpp.yaml | 21 +++++++++++++++++---- ocpmodels/common/utils.py | 2 ++ 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index 464ba0cb05..e9613f1358 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -3,7 +3,7 @@ job: mem: 32GB cpus: 4 gres: gpu:16gb:1 - time: 06:00:00 + time: "12:00:00" default: wandb_project: ocp-qm @@ -50,9 +50,22 @@ default: lse_shift: True runs: - - config: dpp-qm9-all - - config: schnet-qm9-all - model: +- config: dpp-qm9-all + model: + cutoff: 5 + num_spherical: 7 + num_radial: 6 + envelope_exponent: 5 + num_before_skip: 1 + num_after_skip: 2 + num_dense_output: 3 + optim: + batch_size: 32 + lr_initial: 0.001 + warmup_steps: 3000 + +- config: schnet-qm9-all + model: cutoff: 5 hidden_channels: 128 max_num_neighbors: 40 diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index b2867addf6..5e81d3a78b 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -995,6 +995,8 @@ def build_config(args, args_override, silent=False): continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if args.continue_from_dir: continue_config["checkpoint"] = str(latest_ckpt) + else: + continue_config.pop("checkpoint", None) if not args.keep_orion_config: dels = {} for k in continue_config: From 3bf7f887bb45c5c10a4d4589951b6d6ac24e4862 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:56:56 -0500 Subject: [PATCH 250/273] drac version --- configs/exps/icml/qm9/schnet-dpp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index e9613f1358..28a26648c7 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -2,7 +2,7 @@ job: mem: 32GB cpus: 4 - gres: gpu:16gb:1 + gres: gpu:1 time: "12:00:00" default: From 6372e53acf2e4b8ee0f58dfe14bd7a2e2cf4390d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:58:45 -0500 Subject: [PATCH 251/273] indent --- configs/exps/icml/qm9/schnet-dpp.yaml | 44 +++++++++++++-------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index 28a26648c7..bb8d7ac80a 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -50,27 +50,27 @@ default: lse_shift: True runs: -- config: dpp-qm9-all + - config: dpp-qm9-all + model: + cutoff: 5 + num_spherical: 7 + num_radial: 6 + envelope_exponent: 5 + num_before_skip: 1 + num_after_skip: 2 + num_dense_output: 3 + optim: + batch_size: 32 + lr_initial: 0.001 + warmup_steps: 3000 + + - config: schnet-qm9-all model: cutoff: 5 - num_spherical: 7 - num_radial: 6 - envelope_exponent: 5 - num_before_skip: 1 - num_after_skip: 2 - num_dense_output: 3 - optim: - batch_size: 32 - lr_initial: 
0.001 - warmup_steps: 3000 - -- config: schnet-qm9-all - model: - cutoff: 5 - hidden_channels: 128 - max_num_neighbors: 40 - num_filters: 128 - num_gaussians: 100 - num_interactions: 6 - pg_hidden_channels: 0 - phys_embeds: false \ No newline at end of file + hidden_channels: 128 + max_num_neighbors: 40 + num_filters: 128 + num_gaussians: 100 + num_interactions: 6 + pg_hidden_channels: 0 + phys_embeds: false \ No newline at end of file From b852a102ff613d3682eb9d73795624f0aef6ddb7 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:59:14 -0500 Subject: [PATCH 252/273] typo --- configs/exps/icml/qm9/schnet-dpp.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index bb8d7ac80a..aa9987e216 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -51,18 +51,18 @@ default: runs: - config: dpp-qm9-all - model: - cutoff: 5 - num_spherical: 7 - num_radial: 6 - envelope_exponent: 5 - num_before_skip: 1 - num_after_skip: 2 - num_dense_output: 3 - optim: - batch_size: 32 - lr_initial: 0.001 - warmup_steps: 3000 + model: + cutoff: 5 + num_spherical: 7 + num_radial: 6 + envelope_exponent: 5 + num_before_skip: 1 + num_after_skip: 2 + num_dense_output: 3 + optim: + batch_size: 32 + lr_initial: 0.001 + warmup_steps: 3000 - config: schnet-qm9-all model: From de5fe57e25f9e8ad80ad13646e718c91bdb074c0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 16:59:51 -0500 Subject: [PATCH 253/273] update wandb tag --- configs/exps/icml/qm9/schnet-dpp.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index aa9987e216..cf161b9fe5 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -9,7 +9,7 @@ default: wandb_project: ocp-qm mode: train test_ri: true - wandb_tags: qm7x + wandb_tags: qm9 cp_data_to_tmpdir: true note: task: name From 84ef5a6955bf9c117ec639d4deffc315c80defc3 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 17:10:19 -0500 Subject: [PATCH 254/273] tyupo in dataset --- configs/exps/icml/qm9/schnet-dpp.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index cf161b9fe5..0d11460331 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -2,8 +2,8 @@ job: mem: 32GB cpus: 4 - gres: gpu:1 - time: "12:00:00" + gres: gpu:16gb:1 + # time: "12:00:00" default: wandb_project: ocp-qm @@ -44,7 +44,7 @@ default: train: rescale_with_hof: False lse_shift: True - val_id: + val: lse_shift: True test: lse_shift: True From 8d7509e82f3494b89d19dd2d30a9c0f803d17e93 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Tue, 24 Jan 2023 20:12:34 -0500 Subject: [PATCH 255/273] delete wandb resume id --- ocpmodels/common/utils.py | 2 ++ ocpmodels/trainers/base_trainer.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 5e81d3a78b..f2e09c762a 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -995,8 +995,10 @@ def build_config(args, args_override, silent=False): continue_config = torch.load((latest_ckpt), map_location="cpu")["config"] if args.continue_from_dir: continue_config["checkpoint"] = str(latest_ckpt) + 
continue_config["job_ids"] = continue_config["job_ids"] + f", {JOB_ID}" else: continue_config.pop("checkpoint", None) + continue_config.pop("wandb_resume_id", None) if not args.keep_orion_config: dels = {} for k in continue_config: diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 1c02706b49..13252ad414 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -433,7 +433,7 @@ def load_checkpoint(self, checkpoint_path): self.scaler.load_state_dict(checkpoint["amp"]) if "config" in checkpoint: - if "job_ids" in checkpoint["config"]: + if "job_ids" in checkpoint["config"] and JOB_ID not in checkpoint["config"]: self.config["job_ids"] = checkpoint["config"]["job_ids"] + f", {JOB_ID}" def load_loss(self): From 4c4b9ec2f18c180a5c8796d4488c61a73e158559 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 25 Jan 2023 00:41:44 -0500 Subject: [PATCH 256/273] nex exps --- configs/exps/icml/qm7x/dpp-v1.yaml | 17 ++--- configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml | 75 +++++++++++++++++++ configs/exps/icml/qm9/schnet-dpp.yaml | 58 +++++++++----- configs/models/tasks/qm9.yaml | 8 +- 4 files changed, 127 insertions(+), 31 deletions(-) create mode 100644 configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml diff --git a/configs/exps/icml/qm7x/dpp-v1.yaml b/configs/exps/icml/qm7x/dpp-v1.yaml index 45651d6ecf..eb658eb136 100644 --- a/configs/exps/icml/qm7x/dpp-v1.yaml +++ b/configs/exps/icml/qm7x/dpp-v1.yaml @@ -1,9 +1,8 @@ # trainset has 4068193 samples job: - mem: 32GB - cpus: 6 - gres: gpu:1 - time: 24:00:00 + mem: 24GB + cpus: 5 + gres: gpu:24gb:1 default: config: dpp-qm7x-all @@ -12,10 +11,6 @@ default: test_ri: true wandb_tags: qm7x cp_data_to_tmpdir: true - note: - task: name - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces - optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient log_train_every: 250 energy_head: False frame_averaging: "" @@ -25,7 +20,7 @@ default: max_steps: 2000000 warmup_steps: 3000 lr_initial: 0.00025 - eval_every: 1 + eval_every: 0 energy_coefficient: 1 energy_grad_coefficient: 0 force_coefficient: 100 @@ -77,4 +72,6 @@ default: runs: - {} - model: - num_blocks: 6 \ No newline at end of file + optim: + batch_size: 32 + num_blocks: 4 \ No newline at end of file diff --git a/configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml b/configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml new file mode 100644 index 0000000000..afc11c43fb --- /dev/null +++ b/configs/exps/icml/qm9/fanet-orion-qm9-v7.yaml @@ -0,0 +1,75 @@ +# more epochs, larger batch size, explore fanet: larger model & skip-co & mlp_rij +job: + mem: 8GB + cpus: 4 + gres: gpu:1 + time: 02:55:00 + partition: long + +default: + wandb_project: ocp-qm + config: fanet-qm9-all + mode: train + test_ri: true + wandb_tags: qm9, orion + log_train_every: 200 + optim: + batch_size: 64 + warmup_steps: 3000 + # parameters EMA + ema_decay: 0.999 + loss_energy: mse + # early stopping + es_patience: 20 + es_min_abs_change: 0.000001 + es_warmup_epochs: 650 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 + note: + model: name, num_gaussians, hidden_channels, num_filters, num_interactions, phys_embeds, pg_hidden_channels, phys_hidden_channels, energy_head, edge_embed_type, mp_type, graph_norm + optim: batch_size, lr_initial + _root_: frame_averaging, fa_frames + 
orion_mult_factor: + value: 25 + targets: num_filters, hidden_channels, num_gaussians + frame_averaging: 3D + fa_frames: random + model: + mp_type: updownscale_base + edge_embed_type: all_rij + energy_head: "" + num_gaussians: 100 + pg_hidden_channels: 32 + phys_embeds: True + second_layer_MLP: True + skip_co: True + complex_mp: True + graph_norm: True + +orion: + # Remember to change the experiment name if you change anything in the search space + n_jobs: 30 + + unique_exp_name: fanet-qm9-v7.0.0 + + space: + optim/max_epochs: fidelity(650, 1000, base=8) + optim/lr_initial: loguniform(1e-4, 1e-3, precision=3) + model/cutoff: uniform(4.5, 6.5, precision=1) + model/hidden_channels: uniform(10, 20, discrete=True) + model/max_num_neighbors: choices([30, 40, 50]) + model/num_gaussians: choices([50, 100, 150]) + model/num_filters: uniform(10, 20, discrete=True) + model/num_interactions: uniform(3, 6, discrete=True) + algorithms: + asha: + seed: 123 + num_rungs: 3 + num_brackets: 2 diff --git a/configs/exps/icml/qm9/schnet-dpp.yaml b/configs/exps/icml/qm9/schnet-dpp.yaml index 0d11460331..33683721bc 100644 --- a/configs/exps/icml/qm9/schnet-dpp.yaml +++ b/configs/exps/icml/qm9/schnet-dpp.yaml @@ -1,8 +1,8 @@ # trainset has 4068193 samples job: mem: 32GB - cpus: 4 - gres: gpu:16gb:1 + cpus: 6 + gres: gpu:rtx8000:1 # time: "12:00:00" default: @@ -11,10 +11,7 @@ default: test_ri: true wandb_tags: qm9 cp_data_to_tmpdir: true - note: - task: name - model: name, num_gaussians, hidden_channels, num_filters, num_interactions, regress_forces - optim: batch_size, lr_initial, energy_coefficient, force_coefficient, energy_grad_coefficient + note: "qm9 dpp schnet baselines" log_train_every: 250 energy_head: False frame_averaging: "" @@ -24,7 +21,7 @@ default: max_steps: 2000000 warmup_steps: 3000 lr_initial: 0.00025 - eval_every: 1 + eval_every: 0 # 0 is n_train energy_coefficient: 1 energy_grad_coefficient: 0 force_coefficient: 100 @@ -50,6 +47,7 @@ default: lse_shift: True runs: + # # https://github.com/gasteigerjo/dimenet/blob/master/config.yaml - config: dpp-qm9-all model: cutoff: 5 @@ -64,13 +62,39 @@ runs: lr_initial: 0.001 warmup_steps: 3000 - - config: schnet-qm9-all - model: - cutoff: 5 - hidden_channels: 128 - max_num_neighbors: 40 - num_filters: 128 - num_gaussians: 100 - num_interactions: 6 - pg_hidden_channels: 0 - phys_embeds: false \ No newline at end of file + # # + # - config: schnet-qm9-all + # model: + # cutoff: 5 + # hidden_channels: 128 + # max_num_neighbors: 40 + # num_filters: 128 + # num_gaussians: 100 + # num_interactions: 6 + # pg_hidden_channels: 0 + # phys_embeds: false + # https://github.com/atomistic-machine-learning/SchNet/blob/master/scripts/train_energy_force.py#L149 + # - config: schnet-qm9-all + # optim: + # batch_size: 32 + # model: + # cutoff: 20 + # num_interactions: 6 + # num_gaussians: 64 + # num_filters: 64 + # hidden_channels: 128 + # pg_hidden_channels: 0 + # phys_embeds: false + # # https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/nn/models/schnet.html#SchNet + # - config: schnet-qm9-all + # optim: + # batch_size: 32 + # model: + # max_num_neighbors: 32 + # cutoff: 10 + # num_gaussians: 50 + # num_interactions: 6 + # num_filters: 128 + # hidden_channels: 128 + # pg_hidden_channels: 0 + # phys_embeds: false diff --git a/configs/models/tasks/qm9.yaml b/configs/models/tasks/qm9.yaml index ecdc1d1dac..b13954b393 100644 --- a/configs/models/tasks/qm9.yaml +++ b/configs/models/tasks/qm9.yaml @@ -21,13 +21,13 @@ default: - 
internal energy at 0K normalizer: null - + # https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.QM9.html mode: train dataset: default_val: val train: src: /network/projects/ocp/qm9 - target: 12 # predict atomization energy at 0K at index 12 + target: 7 seed: 123 normalize_labels: True # mean and std of target will be set by qm9.py if this is True lse_shift: true @@ -36,7 +36,7 @@ default: end: 110000 val: src: /network/projects/ocp/qm9 - target: 12 # predict atomization energy at 0K at index 12 + target: 7 seed: 123 lse_shift: true indices: @@ -44,7 +44,7 @@ default: end: 120000 test: src: /network/projects/ocp/qm9 - target: 12 # predict atomization energy at 0K at index 12 + target: 7 seed: 123 lse_shift: true indices: From 671e00a02b8c3a869afd3394f7f0ad237d600feb Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 25 Jan 2023 11:37:18 -0500 Subject: [PATCH 257/273] add spherenet FROM DIG (PIUP INSTALL) --- configs/models/spherenet.yaml | 69 +++++++++++++++++++++++++++++++++++ ocpmodels/models/spherenet.py | 58 +++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 configs/models/spherenet.yaml create mode 100644 ocpmodels/models/spherenet.py diff --git a/configs/models/spherenet.yaml b/configs/models/spherenet.yaml new file mode 100644 index 0000000000..1cea3c24bc --- /dev/null +++ b/configs/models/spherenet.yaml @@ -0,0 +1,69 @@ +default: + model: + name: spherenet + use_pbc: True + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + 10k: {} + 100k: {} + all: {} + +# ------------------ +# ----- S2EF ----- +# ------------------ + +s2ef: + default: {} + 200k: {} + 2M: {} + 20M: {} + all: {} + +qm9: + default: + model: + basis_emb_size_angle: 8 + basis_emb_size_dist: 8 + basis_emb_size_torsion: 8 + cutoff: 5.0 + energy_and_force: False + envelope_exponent: 5 + hidden_channels: 128 + int_emb_size: 64 + num_after_skip: 2 + num_before_skip: 1 + num_layers: 4 + num_output_layers: 3 + num_radial: 6 + num_spherical: 3 + out_channels: 1 + out_emb_channels: 256 + optim: + batch_size: 1024 + lr_initial: 0.001 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + lr_gamma: 0.25 + lr_milestones: + - 17981 + - 26972 + - 35963 + - 52000 + - 100000 + warmup_steps: 1000 + + 10k: {} + all: {} + +qm7x: + default: {} + all: {} + 1k: {} + + diff --git a/ocpmodels/models/spherenet.py b/ocpmodels/models/spherenet.py new file mode 100644 index 0000000000..7a5be57181 --- /dev/null +++ b/ocpmodels/models/spherenet.py @@ -0,0 +1,58 @@ +from dig.threedgraph.method import SphereNet as DIGSphereNet +from ocpmodels.models.base_model import BaseModel +import torch +from ocpmodels.common.registry import registry +from ocpmodels.common.utils import conditional_grad +from copy import deepcopy + + +class SphereNet(BaseModel): + def __init__(self, **kwargs): + super().__init__() + self.energy_and_force = kwargs.get("energy_and_force", False) + self.cutoff = kwargs.get("cutoff", 5.0) + self.num_layers = kwargs.get("num_layers", 4) + self.hidden_channels = kwargs.get("hidden_channels", 128) + self.out_channels = kwargs.get("out_channels", 1) + self.int_emb_size = kwargs.get("int_emb_size", 64) + self.basis_emb_size_dist = kwargs.get("basis_emb_size_dist", 8) + self.basis_emb_size_angle = kwargs.get("basis_emb_size_angle", 8) + self.basis_emb_size_torsion = kwargs.get("basis_emb_size_torsion", 8) + self.out_emb_channels = kwargs.get("out_emb_channels", 256) + self.num_spherical = 
kwargs.get("num_spherical", 3) + self.num_radial = kwargs.get("num_radial", 6) + self.envelope_exponent = kwargs.get("envelope_exponent", 5) + self.num_before_skip = kwargs.get("num_before_skip", 1) + self.num_after_skip = kwargs.get("num_after_skip", 2) + self.num_output_layers = kwargs.get("num_output_layers", 3) + self.spherenet = DIGSphereNet( + energy_and_force=self.energy_and_force, + cutoff=self.cutoff, + num_layers=self.num_layers, + hidden_channels=self.hidden_channels, + out_channels=self.out_channels, + int_emb_size=self.int_emb_size, + basis_emb_size_dist=self.basis_emb_size_dist, + basis_emb_size_angle=self.basis_emb_size_angle, + basis_emb_size_torsion=self.basis_emb_size_torsion, + out_emb_channels=self.out_emb_channels, + num_spherical=self.num_spherical, + num_radial=self.num_radial, + envelope_exponent=self.envelope_exponent, + num_before_skip=self.num_before_skip, + num_after_skip=self.num_after_skip, + num_output_layers=self.num_output_layers, + ) + + @conditional_grad(torch.enable_grad()) + def forces_forward(self, preds): + return + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + # Rewire the graph + z = data.atomic_numbers.long() + batch_data = deepcopy(data) + batch_data.z = z + + return self.spherenet.forward(batch_data) From 37333a83e3485663a4d55a9938747e53147b0703 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:02:48 -0500 Subject: [PATCH 258/273] setup distributed in build config --- main.py | 4 ---- ocpmodels/common/dist_utils.py | 3 ++- ocpmodels/common/utils.py | 2 ++ 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index cb81c7fb61..3060ec09c2 100644 --- a/main.py +++ b/main.py @@ -94,10 +94,6 @@ def wrap_up(args, start_time, error=None, signal=None, trainer=None): trainer_config = build_config(args, override_args) - if args.distributed: - dist_utils.setup(trainer_config) - print("Distributed backend setup.") - if dist_utils.is_master(): trainer_config = move_lmdb_data_to_slurm_tmpdir(trainer_config) dist_utils.synchronize() diff --git a/ocpmodels/common/dist_utils.py b/ocpmodels/common/dist_utils.py index 024b98e280..aab74a83ae 100644 --- a/ocpmodels/common/dist_utils.py +++ b/ocpmodels/common/dist_utils.py @@ -14,7 +14,8 @@ def setup(config): - assert config["distributed"] + if not config["distributed"]: + return node_list = os.environ.get("SLURM_STEP_NODELIST") if node_list is None: node_list = os.environ.get("SLURM_JOB_NODELIST") diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index f2e09c762a..bf17f5cb6c 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -38,6 +38,7 @@ import ocpmodels from ocpmodels.common.flags import flags from ocpmodels.common.registry import registry +import ocpmodels.common.dist_utils as dist_utils class Cluster: @@ -1090,6 +1091,7 @@ def build_config(args, args_override, silent=False): config = continue_from_slurm_job_id(config) config = read_slurm_env(config) config["optim"]["eval_batch_size"] = config["optim"]["batch_size"] + dist_utils.setup(config) return config From b1f51261e9ec464ac17d2050c7182124f58b2490 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:03:09 -0500 Subject: [PATCH 259/273] handle bool no arg in `create_dict_from_args` --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index bf17f5cb6c..a7b6231e60 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py 
@@ -807,7 +807,7 @@ def create_dict_from_args(args: list, sep: str = "."): return_dict = {} for arg in args: arg = arg.strip("--") - keys_concat, val = arg.split("=") + keys_concat, val = arg.split("=") if "=" in arg else (arg, "True") val = parse_value(val) key_sequence = keys_concat.split(sep) dict_set_recursively(return_dict, key_sequence, val) From 760c220a18be00a8d858b090f59b2f95f2262d5e Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:03:23 -0500 Subject: [PATCH 260/273] clean up continued congif from previous timestamp, comit, distributed port etc. --- ocpmodels/common/utils.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index a7b6231e60..9e59856e54 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1041,6 +1041,18 @@ def build_config(args, args_override, silent=False): config["world_size"] = args.num_nodes * args.num_gpus if continue_config: + continue_config.pop("timestamp_id", None) + continue_config.pop("commit", None) + continue_config.pop("early_stopping_file", None) + continue_config.pop("timestamp_id", None) + continue_config.pop("distributed_port", None) + continue_config.pop("continue_from_dir", None) + continue_config.pop("restart_from_dir", None) + + continue_config["run_dir"] = resolve(continue_config["run_dir"]) + continue_config["job_id"] = JOB_ID + continue_config["local_rank"] = config["local_rank"] + new_dirs = [ (k, v) for k, v in config.items() if "dir" in k and k != "cp_data_to_tmpdir" ] From 7ce80b0ad700b3b6747cdbf8486e5b2d8a8f7532 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:03:40 -0500 Subject: [PATCH 261/273] also resolve Paths in new_dirs when building conf --- ocpmodels/common/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index 9e59856e54..fd4e879512 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1067,7 +1067,7 @@ def build_config(args, args_override, silent=False): ) config = merge_dicts( continue_config, - {k: resolve(v) if isinstance(v, str) else v for k, v in new_dirs}, + {k: resolve(v) if isinstance(v, (str, Path)) else v for k, v in new_dirs}, ) config["dataset"] = merge_dicts(config["dataset"], data_srcs) cli = cli_args_dict() From 785010355603269baf6c37e359677df6ff26bc08 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:03:57 -0500 Subject: [PATCH 262/273] handle list of dicts in merge_dicts --- ocpmodels/common/utils.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index fd4e879512..e093bd5850 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1459,9 +1459,9 @@ def merge_dicts(dict1: dict, dict2: dict) -> dict: Merged dictionaries. """ if not isinstance(dict1, dict): - raise ValueError(f"Expecting dict1 to be dict, found {type(dict1)}.") + raise ValueError(f"Expecting dict1 to be dict, found {type(dict1)} {dict1}.") if not isinstance(dict2, dict): - raise ValueError(f"Expecting dict2 to be dict, found {type(dict2)}.") + raise ValueError(f"Expecting dict2 to be dict, found {type(dict2)} {dict2}.") return_dict = copy.deepcopy(dict1) @@ -1477,7 +1477,21 @@ def merge_dicts(dict1: dict, dict2: dict) -> dict: f"List for key {k} has different length in dict1 and dict2." + " Use an empty dict {} to pad for items in the shorter list." 
) - return_dict[k] = [merge_dicts(d1, d2) for d1, d2 in zip(dict1[k], v)] + if isinstance(dict1[k][0], dict): + if not isinstance(dict2[k][0], dict): + raise ValueError( + f"Expecting dict for key {k} in dict2. ({dict1}, {dict2})" + ) + return_dict[k] = [ + merge_dicts(d1, d2) for d1, d2 in zip(dict1[k], v) + ] + else: + if isinstance(dict2[k][0], dict): + raise ValueError( + f"Expecting dict for key {k} in dict1. ({dict1}, {dict2})" + ) + return_dict[k] = v + else: return_dict[k] = dict2[k] From 039e2dcb6c3ba92cf3f8faba88e4e093f29af849 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:04:10 -0500 Subject: [PATCH 263/273] fix cuda erro in `segment_coo`, use `unique` --- ocpmodels/common/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocpmodels/common/utils.py b/ocpmodels/common/utils.py index e093bd5850..0603b338be 100644 --- a/ocpmodels/common/utils.py +++ b/ocpmodels/common/utils.py @@ -1537,7 +1537,9 @@ def compute_neighbors(data, edge_index): # Get number of neighbors # segment_coo assumes sorted index ones = edge_index[1].new_ones(1).expand_as(edge_index[1]) - num_neighbors = segment_coo(ones, edge_index[1], dim_size=data.natoms.sum()) + # CUDA error, changing (victor 2023-01-25) + # num_neighbors = segment_coo(ones, edge_index[1], dim_size=data.natoms.sum()) + _, num_neighbors = torch.unique(edge_index[1], return_counts=True) # Get number of neighbors per image image_indptr = torch.zeros( From a0dd92fecadc6234e6759a5665f1f9669e341d65 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:06:12 -0500 Subject: [PATCH 264/273] Add PAINN model --- configs/models/painn.yaml | 70 ++ ocpmodels/common/scaling/__init__.py | 3 + ocpmodels/common/scaling/compat.py | 76 ++ ocpmodels/common/scaling/fit.py | 241 +++++ ocpmodels/common/scaling/scale_factor.py | 170 ++++ ocpmodels/common/scaling/util.py | 23 + ocpmodels/models/painn.py | 879 ++++++++++++++++++ ocpmodels/models/painn_nb6_scaling_factors.pt | Bin 0 -> 2199 bytes 8 files changed, 1462 insertions(+) create mode 100644 configs/models/painn.yaml create mode 100644 ocpmodels/common/scaling/__init__.py create mode 100644 ocpmodels/common/scaling/compat.py create mode 100644 ocpmodels/common/scaling/fit.py create mode 100644 ocpmodels/common/scaling/scale_factor.py create mode 100644 ocpmodels/common/scaling/util.py create mode 100644 ocpmodels/models/painn.py create mode 100644 ocpmodels/models/painn_nb6_scaling_factors.pt diff --git a/configs/models/painn.yaml b/configs/models/painn.yaml new file mode 100644 index 0000000000..2c0abac112 --- /dev/null +++ b/configs/models/painn.yaml @@ -0,0 +1,70 @@ +default: + model: + name: painn + use_pbc: True + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + 10k: {} + 100k: {} + all: {} + +# ------------------ +# ----- S2EF ----- +# ------------------ + +s2ef: + default: {} + 200k: {} + 2M: {} + 20M: {} + all: {} + +qm9: + default: + model: + num_atoms: null # useless + bond_feat_dim: null # useless + num_targets: null # useless + hidden_channels: 512 + num_layers: 6 + num_rbf: 128 + cutoff: 12.0 + max_neighbors: 50 + rbf: {"name": "gaussian"} + envelope: {"name": "polynomial", "exponent": 5} + regress_forces: False + direct_forces: True + use_pbc: False + otf_graph: False + num_elements: 83 + optim: + batch_size: 100 + num_workers: 4 + lr_initial: 0.001 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: 
min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 + + 10k: {} + all: {} + +qm7x: + default: {} + all: {} + 1k: {} + + diff --git a/ocpmodels/common/scaling/__init__.py b/ocpmodels/common/scaling/__init__.py new file mode 100644 index 0000000000..807416b066 --- /dev/null +++ b/ocpmodels/common/scaling/__init__.py @@ -0,0 +1,3 @@ +from .scale_factor import ScaleFactor + +__all__ = ["ScaleFactor"] diff --git a/ocpmodels/common/scaling/compat.py b/ocpmodels/common/scaling/compat.py new file mode 100644 index 0000000000..4240db0556 --- /dev/null +++ b/ocpmodels/common/scaling/compat.py @@ -0,0 +1,76 @@ +import json +import logging +from pathlib import Path +from typing import Dict, Optional, Union + +import torch +import torch.nn as nn + +from .scale_factor import ScaleFactor + +ScaleDict = Union[Dict[str, float], Dict[str, torch.Tensor]] + + +def _load_scale_dict(scale_file: Optional[Union[str, ScaleDict]]): + """ + Loads scale factors from either: + - a JSON file mapping scale factor names to scale values + - a python dictionary pickled object (loaded using `torch.load`) mapping scale factor names to scale values + - a dictionary mapping scale factor names to scale values + """ + if not scale_file: + return None + + if isinstance(scale_file, dict): + if not scale_file: + logging.warning("Empty scale dictionary provided to model.") + return scale_file + + path = Path(scale_file) + if not path.exists(): + raise ValueError(f"Scale file {path} does not exist.") + + scale_dict: Optional[ScaleDict] = None + if path.suffix == ".pt": + scale_dict = torch.load(path) + elif path.suffix == ".json": + with open(path, "r") as f: + scale_dict = json.load(f) + + if isinstance(scale_dict, dict): + # old json scale factors have a comment field that has the model name + scale_dict.pop("comment", None) + else: + raise ValueError(f"Unsupported scale file extension: {path.suffix}") + + if not scale_dict: + return None + + return scale_dict + + +def load_scales_compat( + module: nn.Module, scale_file: Optional[Union[str, ScaleDict]] +): + scale_dict = _load_scale_dict(scale_file) + if not scale_dict: + return + + scale_factors = { + module.name or name: (module, name) + for name, module in module.named_modules() + if isinstance(module, ScaleFactor) + } + logging.debug( + f"Found the following scale factors: {[(k, name) for k, (_, name) in scale_factors.items()]}" + ) + for name, scale in scale_dict.items(): + if name not in scale_factors: + logging.warning(f"Scale factor {name} not found in model") + continue + + scale_module, module_name = scale_factors[name] + logging.debug( + f"Loading scale factor {scale} for ({name} => {module_name})" + ) + scale_module.set_(scale) diff --git a/ocpmodels/common/scaling/fit.py b/ocpmodels/common/scaling/fit.py new file mode 100644 index 0000000000..83f1f72c7d --- /dev/null +++ b/ocpmodels/common/scaling/fit.py @@ -0,0 +1,241 @@ +import logging +import math +import readline +import sys +from itertools import islice +from pathlib import Path +from typing import TYPE_CHECKING, Dict, Literal + +import torch +import torch.nn as nn +from torch.nn.parallel.distributed import DistributedDataParallel + +from ocpmodels.common.data_parallel import OCPDataParallel +from ocpmodels.common.flags import flags +from ocpmodels.common.utils import ( + build_config, + new_trainer_context, + setup_logging, +) +from ocpmodels.modules.scaling import ScaleFactor +from ocpmodels.modules.scaling.compat import load_scales_compat + +if 
TYPE_CHECKING: + from ocpmodels.trainers.base_trainer import BaseTrainer + + +def _prefilled_input(prompt: str, prefill: str = ""): + readline.set_startup_hook(lambda: readline.insert_text(prefill)) + try: + return input(prompt) + finally: + readline.set_startup_hook() + + +def _train_batch(trainer: "BaseTrainer", batch): + with torch.no_grad(): + with torch.cuda.amp.autocast(enabled=trainer.scaler is not None): + out = trainer._forward(batch) + loss = trainer._compute_loss(out, batch) + del out, loss + + +def main(*, num_batches: int = 16): + # region args/config setup + setup_logging() + + parser = flags.get_parser() + args, override_args = parser.parse_known_args() + _config = build_config(args, override_args) + _config["logger"] = "tensorboard" + # endregion + + assert not args.distributed, "This doesn't work with DDP" + with new_trainer_context(args=args, config=_config) as ctx: + config = ctx.config + trainer = ctx.trainer + + ckpt_file = config.get("checkpoint", None) + assert ( + ckpt_file is not None + ), "Checkpoint file not specified. Please specify --checkpoint " + ckpt_file = Path(ckpt_file) + + logging.info( + f"Input checkpoint path: {ckpt_file}, {ckpt_file.exists()=}" + ) + + model: nn.Module = trainer.model + val_loader = trainer.val_loader + assert ( + val_loader is not None + ), "Val dataset is required for making predictions" + + if ckpt_file.exists(): + trainer.load_checkpoint(str(ckpt_file)) + + # region reoad scale file contents if necessary + # unwrap module from DP/DDP + unwrapped_model = model + while isinstance( + unwrapped_model, (DistributedDataParallel, OCPDataParallel) + ): + unwrapped_model = unwrapped_model.module + assert isinstance( + unwrapped_model, nn.Module + ), "Model is not a nn.Module" + load_scales_compat(unwrapped_model, config.get("scale_file", None)) + # endregion + + model.eval() + + # recursively go through the submodules and get the ScaleFactor modules + scale_factors: Dict[str, ScaleFactor] = { + name: module + for name, module in model.named_modules() + if isinstance(module, ScaleFactor) + } + + mode: Literal["all", "unfitted"] = "all" + + # region detect fitted/unfitted factors + fitted_scale_factors = [ + f"{name}: {module.scale_factor.item():.3f}" + for name, module in scale_factors.items() + if module.fitted + ] + unfitted_scale_factors = [ + name for name, module in scale_factors.items() if not module.fitted + ] + fitted_scale_factors_str = ", ".join(fitted_scale_factors) + logging.info(f"Fitted scale factors: [{fitted_scale_factors_str}]") + unfitted_scale_factors_str = ", ".join(unfitted_scale_factors) + logging.info(f"Unfitted scale factors: [{unfitted_scale_factors_str}]") + + if fitted_scale_factors: + flag = input( + "Do you want to continue and fit all scale factors (1), " + "only fit the variables not fitted yet (2), or exit (3)? " + ) + if str(flag) == "1": + mode = "all" + logging.info("Fitting all scale factors.") + elif str(flag) == "2": + mode = "unfitted" + logging.info("Only fitting unfitted variables.") + else: + print(flag) + logging.info("Exiting script") + sys.exit() + # endregion + + # region get the output path + out_path = Path( + _prefilled_input( + "Enter output path for fitted scale factors: ", + prefill=str(ckpt_file), + ) + ) + if out_path.exists(): + logging.warning(f"Already found existing file: {out_path}") + flag = input( + "Do you want to continue and overwrite existing file (1), " + "or exit (2)? 
" + ) + if str(flag) == "1": + logging.info("Overwriting existing file.") + else: + logging.info("Exiting script") + sys.exit() + + logging.info( + f"Output path for fitted scale factors: {out_path}, {out_path.exists()=}" + ) + # endregion + + # region reset the scale factors if mode == "all" + if mode == "all": + logging.info("Fitting all scale factors.") + for name, scale_factor in scale_factors.items(): + if scale_factor.fitted: + logging.info( + f"{name} is already fitted in the checkpoint, resetting it. {scale_factor.scale_factor}" + ) + scale_factor.reset_() + # endregion + + # region we do a single pass through the network to get the correct execution order of the scale factors + scale_factor_indices: Dict[str, int] = {} + max_idx = 0 + + # initialize all scale factors + for name, module in scale_factors.items(): + + def index_fn(name=name): + nonlocal max_idx + assert name is not None + if name not in scale_factor_indices: + scale_factor_indices[name] = max_idx + logging.debug(f"Scale factor for {name} = {max_idx}") + max_idx += 1 + + module.initialize_(index_fn=index_fn) + + # single pass through network + _train_batch(trainer, next(iter(val_loader))) + + # sort the scale factors by their computation order + sorted_factors = sorted( + scale_factors.items(), + key=lambda x: scale_factor_indices.get(x[0], math.inf), + ) + + logging.info("Sorted scale factors by computation order:") + for name, _ in sorted_factors: + logging.info(f"{name}: {scale_factor_indices[name]}") + + # endregion + + # loop over the scale factors in the computation order + # and fit them one by one + logging.info("Start fitting") + + for name, module in sorted_factors: + if mode == "unfitted" and module.fitted: + logging.info(f"Skipping {name} (already fitted)") + continue + + logging.info(f"Fitting {name}...") + with module.fit_context_(): + for batch in islice(val_loader, num_batches): + _train_batch(trainer, batch) + stats, ratio, value = module.fit_() + + logging.info( + f"Variable: {name}, " + f"Var_in: {stats['variance_in']:.3f}, " + f"Var_out: {stats['variance_out']:.3f}, " + f"Ratio: {ratio:.3f} => Scaling factor: {value:.3f}" + ) + + # make sure all scale factors are fitted + for name, module in sorted_factors: + assert module.fitted, f"{name} is not fitted" + + # region save the scale factors to the checkpoint file + trainer.config["cmd"]["checkpoint_dir"] = out_path.parent + trainer.is_debug = False + out_file = trainer.save( + metrics=None, + checkpoint_file=out_path.name, + training_state=False, + ) + assert out_file is not None, "Failed to save checkpoint" + out_file = Path(out_file) + assert out_file.exists(), f"Failed to save checkpoint to {out_file}" + # endregion + logging.info(f"Saved results to: {out_file}") + + +if __name__ == "__main__": + main() diff --git a/ocpmodels/common/scaling/scale_factor.py b/ocpmodels/common/scaling/scale_factor.py new file mode 100644 index 0000000000..8a8d5a55a5 --- /dev/null +++ b/ocpmodels/common/scaling/scale_factor.py @@ -0,0 +1,170 @@ +import itertools +import logging +import math +from contextlib import contextmanager +from typing import Callable, Optional, TypedDict, Union + +import torch +import torch.nn as nn + + +class _Stats(TypedDict): + variance_in: float + variance_out: float + n_samples: int + + +IndexFn = Callable[[], None] + + +def _check_consistency(old: torch.Tensor, new: torch.Tensor, key: str): + if not torch.allclose(old, new): + raise ValueError( + f"Scale factor parameter {key} is inconsistent with the loaded state dict.\n" + f"Old: 
{old}\n" + f"Actual: {new}" + ) + + +class ScaleFactor(nn.Module): + scale_factor: torch.Tensor + + name: Optional[str] = None + index_fn: Optional[IndexFn] = None + stats: Optional[_Stats] = None + + def __init__( + self, + name: Optional[str] = None, + enforce_consistency: bool = True, + ): + super().__init__() + + self.name = name + self.index_fn = None + self.stats = None + + self.scale_factor = nn.parameter.Parameter( + torch.tensor(0.0), requires_grad=False + ) + if enforce_consistency: + self._register_load_state_dict_pre_hook(self._enforce_consistency) + + def _enforce_consistency( + self, + state_dict, + prefix, + _local_metadata, + _strict, + _missing_keys, + _unexpected_keys, + _error_msgs, + ): + if not self.fitted: + return + + persistent_buffers = { + k: v + for k, v in self._buffers.items() + if k not in self._non_persistent_buffers_set + } + local_name_params = itertools.chain( + self._parameters.items(), persistent_buffers.items() + ) + local_state = {k: v for k, v in local_name_params if v is not None} + + for name, param in local_state.items(): + key = prefix + name + if key not in state_dict: + continue + + input_param = state_dict[key] + _check_consistency(old=param, new=input_param, key=key) + + @property + def fitted(self): + return bool((self.scale_factor != 0.0).item()) + + @torch.jit.unused + def reset_(self): + self.scale_factor.zero_() + + @torch.jit.unused + def set_(self, scale: Union[float, torch.Tensor]): + if self.fitted: + _check_consistency( + old=self.scale_factor, + new=torch.tensor(scale) if isinstance(scale, float) else scale, + key="scale_factor", + ) + self.scale_factor.fill_(scale) + + @torch.jit.unused + def initialize_(self, *, index_fn: Optional[IndexFn] = None): + self.index_fn = index_fn + + @contextmanager + @torch.jit.unused + def fit_context_(self): + self.stats = _Stats(variance_in=0.0, variance_out=0.0, n_samples=0) + yield + del self.stats + self.stats = None + + @torch.jit.unused + def fit_(self): + assert self.stats, "Stats not set" + for k, v in self.stats.items(): + assert v > 0, f"{k} is {v}" + + self.stats["variance_in"] = ( + self.stats["variance_in"] / self.stats["n_samples"] + ) + self.stats["variance_out"] = ( + self.stats["variance_out"] / self.stats["n_samples"] + ) + + ratio = self.stats["variance_out"] / self.stats["variance_in"] + value = math.sqrt(1 / ratio) + + self.set_(value) + + stats = dict(**self.stats) + return stats, ratio, value + + @torch.no_grad() + @torch.jit.unused + def _observe(self, x: torch.Tensor, ref: Optional[torch.Tensor] = None): + if self.stats is None: + logging.debug("Observer not initialized but self.observe() called") + return + + n_samples = x.shape[0] + self.stats["variance_out"] += ( + torch.mean(torch.var(x, dim=0)).item() * n_samples + ) + + if ref is None: + self.stats["variance_in"] += n_samples + else: + self.stats["variance_in"] += ( + torch.mean(torch.var(ref, dim=0)).item() * n_samples + ) + self.stats["n_samples"] += n_samples + + def forward( + self, + x: torch.Tensor, + *, + ref: Optional[torch.Tensor] = None, + ): + if self.index_fn is not None: + self.index_fn() + + if self.fitted: + x = x * self.scale_factor + + if not torch.jit.is_scripting(): + self._observe(x, ref=ref) + + return x diff --git a/ocpmodels/common/scaling/util.py b/ocpmodels/common/scaling/util.py new file mode 100644 index 0000000000..15c58b5d42 --- /dev/null +++ b/ocpmodels/common/scaling/util.py @@ -0,0 +1,23 @@ +import logging + +import torch.nn as nn + +from .scale_factor import ScaleFactor + + +def 
ensure_fitted(module: nn.Module, warn: bool = False): + for name, child in module.named_modules(): + if not isinstance(child, ScaleFactor) or child.fitted: + continue + if child.name is not None: + name = f"{child.name} ({name})" + msg = ( + f"Scale factor {name} is not fitted. " + "Please make sure that you either (1) load a checkpoint with fitted scale factors, " + "(2) explicitly load scale factors using the `model.scale_file` attribute, or " + "(3) fit the scale factors using the `fit.py` script." + ) + if warn: + logging.warning(msg) + else: + raise ValueError(msg) diff --git a/ocpmodels/models/painn.py b/ocpmodels/models/painn.py new file mode 100644 index 0000000000..8b2f5d45c3 --- /dev/null +++ b/ocpmodels/models/painn.py @@ -0,0 +1,879 @@ +""" +Copyright (c) Facebook, Inc. and its affiliates. + +This source code is licensed under the MIT license found in the +LICENSE file in the root directory of this source tree. + +--- + +MIT License + +Copyright (c) 2021 www.compscience.org + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + +import logging +import math +import os +from typing import Optional, Tuple + +import torch +from torch import nn +from torch_geometric.nn import MessagePassing, radius_graph +from torch_scatter import scatter, segment_coo, segment_csr + +from ocpmodels.common.registry import registry +from ocpmodels.common.utils import ( + compute_neighbors, + conditional_grad, + get_pbc_distances, + radius_graph_pbc, + ROOT, +) +import numpy as np +from ocpmodels.models.base_model import BaseModel +from ocpmodels.models.gemnet.layers.base_layers import ScaledSiLU + +# from ocpmodels.models.gemnet.layers.embedding_block import AtomEmbedding # updated version copied here +from ocpmodels.models.gemnet.layers.radial_basis import RadialBasis +from ocpmodels.common.scaling import ScaleFactor +from ocpmodels.common.scaling.compat import load_scales_compat + +# from .utils import get_edge_id, repeat_blocks copied here + + +class AtomEmbedding(torch.nn.Module): + """ + Initial atom embeddings based on the atom type + + Parameters + ---------- + emb_size: int + Atom embeddings size + """ + + def __init__(self, emb_size, num_elements): + super().__init__() + self.emb_size = emb_size + + self.embeddings = torch.nn.Embedding(num_elements, emb_size) + # init by uniform distribution + torch.nn.init.uniform_(self.embeddings.weight, a=-np.sqrt(3), b=np.sqrt(3)) + + def forward(self, Z): + """ + Returns + ------- + h: torch.Tensor, shape=(nAtoms, emb_size) + Atom embeddings. 
+ """ + h = self.embeddings(Z - 1) # -1 because Z.min()=1 (==Hydrogen) + return h + + +def repeat_blocks( + sizes, + repeats, + continuous_indexing=True, + start_idx=0, + block_inc=0, + repeat_inc=0, +): + """Repeat blocks of indices. + Adapted from https://stackoverflow.com/questions/51154989/numpy-vectorized-function-to-repeat-blocks-of-consecutive-elements + + continuous_indexing: Whether to keep increasing the index after each block + start_idx: Starting index + block_inc: Number to increment by after each block, + either global or per block. Shape: len(sizes) - 1 + repeat_inc: Number to increment by after each repetition, + either global or per block + + Examples + -------- + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = False + Return: [0 0 0 0 1 2 0 1 2 0 1 0 1 0 1] + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = True + Return: [0 0 0 1 2 3 1 2 3 4 5 4 5 4 5] + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = True ; + repeat_inc = 4 + Return: [0 4 8 1 2 3 5 6 7 4 5 8 9 12 13] + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = True ; + start_idx = 5 + Return: [5 5 5 6 7 8 6 7 8 9 10 9 10 9 10] + sizes = [1,3,2] ; repeats = [3,2,3] ; continuous_indexing = True ; + block_inc = 1 + Return: [0 0 0 2 3 4 2 3 4 6 7 6 7 6 7] + sizes = [0,3,2] ; repeats = [3,2,3] ; continuous_indexing = True + Return: [0 1 2 0 1 2 3 4 3 4 3 4] + sizes = [2,3,2] ; repeats = [2,0,2] ; continuous_indexing = True + Return: [0 1 0 1 5 6 5 6] + """ + assert sizes.dim() == 1 + assert all(sizes >= 0) + + # Remove 0 sizes + sizes_nonzero = sizes > 0 + if not torch.all(sizes_nonzero): + assert block_inc == 0 # Implementing this is not worth the effort + sizes = torch.masked_select(sizes, sizes_nonzero) + if isinstance(repeats, torch.Tensor): + repeats = torch.masked_select(repeats, sizes_nonzero) + if isinstance(repeat_inc, torch.Tensor): + repeat_inc = torch.masked_select(repeat_inc, sizes_nonzero) + + if isinstance(repeats, torch.Tensor): + assert all(repeats >= 0) + insert_dummy = repeats[0] == 0 + if insert_dummy: + one = sizes.new_ones(1) + zero = sizes.new_zeros(1) + sizes = torch.cat((one, sizes)) + repeats = torch.cat((one, repeats)) + if isinstance(block_inc, torch.Tensor): + block_inc = torch.cat((zero, block_inc)) + if isinstance(repeat_inc, torch.Tensor): + repeat_inc = torch.cat((zero, repeat_inc)) + else: + assert repeats >= 0 + insert_dummy = False + + # Get repeats for each group using group lengths/sizes + r1 = torch.repeat_interleave(torch.arange(len(sizes), device=sizes.device), repeats) + + # Get total size of output array, as needed to initialize output indexing array + N = (sizes * repeats).sum() + + # Initialize indexing array with ones as we need to setup incremental indexing + # within each group when cumulatively summed at the final stage. + # Two steps here: + # 1. Within each group, we have multiple sequences, so setup the offsetting + # at each sequence lengths by the seq. lengths preceding those. 
+ id_ar = torch.ones(N, dtype=torch.long, device=sizes.device) + id_ar[0] = 0 + insert_index = sizes[r1[:-1]].cumsum(0) + insert_val = (1 - sizes)[r1[:-1]] + + if isinstance(repeats, torch.Tensor) and torch.any(repeats == 0): + diffs = r1[1:] - r1[:-1] + indptr = torch.cat((sizes.new_zeros(1), diffs.cumsum(0))) + if continuous_indexing: + # If a group was skipped (repeats=0) we need to add its size + insert_val += segment_csr(sizes[: r1[-1]], indptr, reduce="sum") + + # Add block increments + if isinstance(block_inc, torch.Tensor): + insert_val += segment_csr(block_inc[: r1[-1]], indptr, reduce="sum") + else: + insert_val += block_inc * (indptr[1:] - indptr[:-1]) + if insert_dummy: + insert_val[0] -= block_inc + else: + idx = r1[1:] != r1[:-1] + if continuous_indexing: + # 2. For each group, make sure the indexing starts from the next group's + # first element. So, simply assign 1s there. + insert_val[idx] = 1 + + # Add block increments + insert_val[idx] += block_inc + + # Add repeat_inc within each group + if isinstance(repeat_inc, torch.Tensor): + insert_val += repeat_inc[r1[:-1]] + if isinstance(repeats, torch.Tensor): + repeat_inc_inner = repeat_inc[repeats > 0][:-1] + else: + repeat_inc_inner = repeat_inc[:-1] + else: + insert_val += repeat_inc + repeat_inc_inner = repeat_inc + + # Subtract the increments between groups + if isinstance(repeats, torch.Tensor): + repeats_inner = repeats[repeats > 0][:-1] + else: + repeats_inner = repeats + insert_val[r1[1:] != r1[:-1]] -= repeat_inc_inner * repeats_inner + + # Assign index-offsetting values + id_ar[insert_index] = insert_val + + if insert_dummy: + id_ar = id_ar[1:] + if continuous_indexing: + id_ar[0] -= 1 + + # Set start index now, in case of insertion due to leading repeats=0 + id_ar[0] += start_idx + + # Finally index into input array for the group repeated o/p + res = id_ar.cumsum(0) + return res + + +def get_edge_id(edge_idx, cell_offsets, num_atoms): + cell_basis = cell_offsets.max() - cell_offsets.min() + 1 + cell_id = ( + (cell_offsets * cell_offsets.new_tensor([[1, cell_basis, cell_basis**2]])) + .sum(-1) + .long() + ) + edge_id = edge_idx[0] + edge_idx[1] * num_atoms + cell_id * num_atoms**2 + return edge_id + + +@registry.register_model("painn") +class PaiNN(BaseModel): + r"""PaiNN model based on the description in Schütt et al. (2021): + Equivariant message passing for the prediction of tensorial properties + and molecular spectra, https://arxiv.org/abs/2102.03150. + """ + + def __init__(self, **kwargs): + super(PaiNN, self).__init__() + self.num_atoms = kwargs.get("num_atoms") + self.bond_feat_dim = kwargs.get("bond_feat_dim") + self.num_targets = kwargs.get("num_targets") + self.hidden_channels = kwargs.get("hidden_channels", 512) + self.num_layers = kwargs.get("num_layers", 6) + self.num_rbf = kwargs.get("num_rbf", 128) + self.cutoff = kwargs.get("cutoff", 12.0) + self.max_neighbors = kwargs.get("max_neighbors", 50) + self.rbf = kwargs.get("rbf", {"name": "gaussian"}) + self.envelope = kwargs.get("envelope", {"name": "polynomial", "exponent": 5}) + self.regress_forces = kwargs.get("regress_forces", True) + self.direct_forces = kwargs.get("direct_forces", True) + self.use_pbc = kwargs.get("use_pbc", True) + self.otf_graph = kwargs.get("otf_graph", True) + self.num_elements = kwargs.get("num_elements", 83) + self.scale_file = ROOT / "ocpmodels" / "models" / "painn_nb6_scaling_factors.pt" + + # Borrowed from GemNet. 
+ self.symmetric_edge_symmetrization = False + + #### Learnable parameters ############################################# + + self.atom_emb = AtomEmbedding(self.hidden_channels, self.num_elements) + + self.radial_basis = RadialBasis( + num_radial=self.num_rbf, + cutoff=self.cutoff, + rbf=self.rbf, + envelope=self.envelope, + ) + + self.message_layers = nn.ModuleList() + self.update_layers = nn.ModuleList() + + for i in range(self.num_layers): + self.message_layers.append( + PaiNNMessage(self.hidden_channels, self.num_rbf).jittable() + ) + self.update_layers.append(PaiNNUpdate(self.hidden_channels)) + setattr(self, "upd_out_scalar_scale_%d" % i, ScaleFactor()) + + self.out_energy = nn.Sequential( + nn.Linear(self.hidden_channels, self.hidden_channels // 2), + ScaledSiLU(), + nn.Linear(self.hidden_channels // 2, 1), + ) + + if self.regress_forces is True and self.direct_forces is True: + self.out_forces = PaiNNOutput(self.hidden_channels) + + self.inv_sqrt_2 = 1 / math.sqrt(2.0) + + self.reset_parameters() + + load_scales_compat(self, self.scale_file) + + def reset_parameters(self): + nn.init.xavier_uniform_(self.out_energy[0].weight) + self.out_energy[0].bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_energy[2].weight) + self.out_energy[2].bias.data.fill_(0) + + # Borrowed from GemNet. + def select_symmetric_edges(self, tensor, mask, reorder_idx, inverse_neg): + # Mask out counter-edges + tensor_directed = tensor[mask] + # Concatenate counter-edges after normal edges + sign = 1 - 2 * inverse_neg + tensor_cat = torch.cat([tensor_directed, sign * tensor_directed]) + # Reorder everything so the edges of every image are consecutive + tensor_ordered = tensor_cat[reorder_idx] + return tensor_ordered + + # Borrowed from GemNet. + def symmetrize_edges( + self, + edge_index, + cell_offsets, + neighbors, + batch_idx, + reorder_tensors, + reorder_tensors_invneg, + ): + """ + Symmetrize edges to ensure existence of counter-directional edges. + + Some edges are only present in one direction in the data, + since every atom has a maximum number of neighbors. + If `symmetric_edge_symmetrization` is False, + we only use i->j edges here. So we lose some j->i edges + and add others by making it symmetric. + If `symmetric_edge_symmetrization` is True, + we always use both directions. 
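+
+        Returns the symmetrized edge_index, cell_offsets and per-image neighbor
+        counts, the reordered tensors (the second list with sign flipped on
+        counter-edges), and id_swap, an index array mapping every edge to its
+        counter-directional edge.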
+ """ + num_atoms = batch_idx.shape[0] + + if self.symmetric_edge_symmetrization: + edge_index_bothdir = torch.cat( + [edge_index, edge_index.flip(0)], + dim=1, + ) + cell_offsets_bothdir = torch.cat( + [cell_offsets, -cell_offsets], + dim=0, + ) + + # Filter for unique edges + edge_ids = get_edge_id(edge_index_bothdir, cell_offsets_bothdir, num_atoms) + unique_ids, unique_inv = torch.unique(edge_ids, return_inverse=True) + perm = torch.arange( + unique_inv.size(0), + dtype=unique_inv.dtype, + device=unique_inv.device, + ) + unique_idx = scatter( + perm, + unique_inv, + dim=0, + dim_size=unique_ids.shape[0], + reduce="min", + ) + edge_index_new = edge_index_bothdir[:, unique_idx] + + # Order by target index + edge_index_order = torch.argsort(edge_index_new[1]) + edge_index_new = edge_index_new[:, edge_index_order] + unique_idx = unique_idx[edge_index_order] + + # Subindex remaining tensors + cell_offsets_new = cell_offsets_bothdir[unique_idx] + reorder_tensors = [ + self.symmetrize_tensor(tensor, unique_idx, False) + for tensor in reorder_tensors + ] + reorder_tensors_invneg = [ + self.symmetrize_tensor(tensor, unique_idx, True) + for tensor in reorder_tensors_invneg + ] + + # Count edges per image + # segment_coo assumes sorted edge_index_new[1] and batch_idx + ones = edge_index_new.new_ones(1).expand_as(edge_index_new[1]) + neighbors_per_atom = segment_coo( + ones, edge_index_new[1], dim_size=num_atoms + ) + neighbors_per_image = segment_coo( + neighbors_per_atom, batch_idx, dim_size=neighbors.shape[0] + ) + else: + # Generate mask + mask_sep_atoms = edge_index[0] < edge_index[1] + # Distinguish edges between the same (periodic) atom by ordering the cells + cell_earlier = ( + (cell_offsets[:, 0] < 0) + | ((cell_offsets[:, 0] == 0) & (cell_offsets[:, 1] < 0)) + | ( + (cell_offsets[:, 0] == 0) + & (cell_offsets[:, 1] == 0) + & (cell_offsets[:, 2] < 0) + ) + ) + mask_same_atoms = edge_index[0] == edge_index[1] + mask_same_atoms &= cell_earlier + mask = mask_sep_atoms | mask_same_atoms + + # Mask out counter-edges + edge_index_new = edge_index[mask[None, :].expand(2, -1)].view(2, -1) + + # Concatenate counter-edges after normal edges + edge_index_cat = torch.cat( + [edge_index_new, edge_index_new.flip(0)], + dim=1, + ) + + # Count remaining edges per image + batch_edge = torch.repeat_interleave( + torch.arange(neighbors.size(0), device=edge_index.device), + neighbors, + ) + batch_edge = batch_edge[mask] + # segment_coo assumes sorted batch_edge + # Factor 2 since this is only one half of the edges + ones = batch_edge.new_ones(1).expand_as(batch_edge) + neighbors_per_image = 2 * segment_coo( + ones, batch_edge, dim_size=neighbors.size(0) + ) + + # Create indexing array + edge_reorder_idx = repeat_blocks( + torch.div(neighbors_per_image, 2, rounding_mode="floor"), + repeats=2, + continuous_indexing=True, + repeat_inc=edge_index_new.size(1), + ) + + # Reorder everything so the edges of every image are consecutive + edge_index_new = edge_index_cat[:, edge_reorder_idx] + cell_offsets_new = self.select_symmetric_edges( + cell_offsets, mask, edge_reorder_idx, True + ) + reorder_tensors = [ + self.select_symmetric_edges(tensor, mask, edge_reorder_idx, False) + for tensor in reorder_tensors + ] + reorder_tensors_invneg = [ + self.select_symmetric_edges(tensor, mask, edge_reorder_idx, True) + for tensor in reorder_tensors_invneg + ] + + # Indices for swapping c->a and a->c (for symmetric MP) + # To obtain these efficiently and without any index assumptions, + # we get order the counter-edge IDs and 
then + # map this order back to the edge IDs. + # Double argsort gives the desired mapping + # from the ordered tensor to the original tensor. + edge_ids = get_edge_id(edge_index_new, cell_offsets_new, num_atoms) + order_edge_ids = torch.argsort(edge_ids) + inv_order_edge_ids = torch.argsort(order_edge_ids) + edge_ids_counter = get_edge_id( + edge_index_new.flip(0), -cell_offsets_new, num_atoms + ) + order_edge_ids_counter = torch.argsort(edge_ids_counter) + id_swap = order_edge_ids_counter[inv_order_edge_ids] + + return ( + edge_index_new, + cell_offsets_new, + neighbors_per_image, + reorder_tensors, + reorder_tensors_invneg, + id_swap, + ) + + def generate_graph( + self, + data, + cutoff=None, + max_neighbors=None, + use_pbc=None, + otf_graph=None, + ): + cutoff = cutoff or self.cutoff + max_neighbors = max_neighbors or self.max_neighbors + use_pbc = use_pbc or self.use_pbc + otf_graph = otf_graph or self.otf_graph + + if not otf_graph: + try: + edge_index = data.edge_index + + if use_pbc: + cell_offsets = data.cell_offsets + neighbors = data.neighbors + + except AttributeError: + logging.warning( + "Turning otf_graph=True as required attributes not present in data object" + ) + otf_graph = True + + if use_pbc: + if otf_graph: + edge_index, cell_offsets, neighbors = radius_graph_pbc( + data, cutoff, max_neighbors + ) + + out = get_pbc_distances( + data.pos, + edge_index, + data.cell, + cell_offsets, + neighbors, + return_offsets=True, + return_distance_vec=True, + ) + + edge_index = out["edge_index"] + edge_dist = out["distances"] + cell_offset_distances = out["offsets"] + distance_vec = out["distance_vec"] + else: + if otf_graph: + edge_index = radius_graph( + data.pos, + r=cutoff, + batch=data.batch, + max_num_neighbors=max_neighbors, + ) + + j, i = edge_index + distance_vec = data.pos[j] - data.pos[i] + + edge_dist = distance_vec.norm(dim=-1) + cell_offsets = torch.zeros(edge_index.shape[1], 3, device=data.pos.device) + cell_offset_distances = torch.zeros_like( + cell_offsets, device=data.pos.device + ) + neighbors = compute_neighbors(data, edge_index) + + return ( + edge_index, + edge_dist, + distance_vec, + cell_offsets, + cell_offset_distances, + neighbors, + ) + + def generate_graph_values(self, data): + ( + edge_index, + edge_dist, + distance_vec, + cell_offsets, + _, # cell offset distances + neighbors, + ) = self.generate_graph(data) + + # Unit vectors pointing from edge_index[1] to edge_index[0], + # i.e., edge_index[0] - edge_index[1] divided by the norm. 
+ # make sure that the distances are not close to zero before dividing + mask_zero = torch.isclose(edge_dist, torch.tensor(0.0), atol=1e-6) + edge_dist[mask_zero] = 1.0e-6 + edge_vector = distance_vec / edge_dist[:, None] + + empty_image = neighbors == 0 + if torch.any(empty_image): + raise ValueError( + f"An image has no neighbors: id={data.id[empty_image]}, " + f"sid={data.sid[empty_image]}, fid={data.fid[empty_image]}" + ) + + # Symmetrize edges for swapping in symmetric message passing + ( + edge_index, + cell_offsets, + neighbors, + [edge_dist], + [edge_vector], + id_swap, + ) = self.symmetrize_edges( + edge_index, + cell_offsets, + neighbors, + data.batch, + [edge_dist], + [edge_vector], + ) + + return ( + edge_index, + neighbors, + edge_dist, + edge_vector, + id_swap, + ) + + @conditional_grad(torch.enable_grad()) + def forces_forward(self, preds): + return + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + pos = data.pos + batch = data.batch + z = data.atomic_numbers.long() + + if self.regress_forces and not self.direct_forces: + pos = pos.requires_grad_(True) + + ( + edge_index, + neighbors, + edge_dist, + edge_vector, + id_swap, + ) = self.generate_graph_values(data) + + assert z.dim() == 1 and z.dtype == torch.long + + edge_rbf = self.radial_basis(edge_dist) # rbf * envelope + + x = self.atom_emb(z) + vec = torch.zeros(x.size(0), 3, x.size(1), device=x.device) + + #### Interaction blocks ############################################### + + for i in range(self.num_layers): + dx, dvec = self.message_layers[i](x, vec, edge_index, edge_rbf, edge_vector) + + x = x + dx + vec = vec + dvec + x = x * self.inv_sqrt_2 + + dx, dvec = self.update_layers[i](x, vec) + + x = x + dx + vec = vec + dvec + x = getattr(self, "upd_out_scalar_scale_%d" % i)(x) + + #### Output block ##################################################### + + per_atom_energy = self.out_energy(x).squeeze(1) + energy = scatter(per_atom_energy, batch, dim=0) + + if self.regress_forces: + if self.direct_forces: + forces = self.out_forces(x, vec) + return energy, forces + else: + forces = ( + -1 + * torch.autograd.grad( + x, + pos, + grad_outputs=torch.ones_like(x), + create_graph=True, + )[0] + ) + return energy, forces + else: + return {"energy": energy} + + @property + def num_params(self): + return sum(p.numel() for p in self.parameters()) + + def __repr__(self): + return ( + f"{self.__class__.__name__}(" + f"hidden_channels={self.hidden_channels}, " + f"num_layers={self.num_layers}, " + f"num_rbf={self.num_rbf}, " + f"max_neighbors={self.max_neighbors}, " + f"cutoff={self.cutoff})" + ) + + +class PaiNNMessage(MessagePassing): + def __init__( + self, + hidden_channels, + num_rbf, + ): + super(PaiNNMessage, self).__init__(aggr="add", node_dim=0) + + self.hidden_channels = hidden_channels + + self.x_proj = nn.Sequential( + nn.Linear(hidden_channels, hidden_channels), + ScaledSiLU(), + nn.Linear(hidden_channels, hidden_channels * 3), + ) + self.rbf_proj = nn.Linear(num_rbf, hidden_channels * 3) + + self.inv_sqrt_3 = 1 / math.sqrt(3.0) + self.inv_sqrt_h = 1 / math.sqrt(hidden_channels) + self.x_layernorm = nn.LayerNorm(hidden_channels) + + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.x_proj[0].weight) + self.x_proj[0].bias.data.fill_(0) + nn.init.xavier_uniform_(self.x_proj[2].weight) + self.x_proj[2].bias.data.fill_(0) + nn.init.xavier_uniform_(self.rbf_proj.weight) + self.rbf_proj.bias.data.fill_(0) + self.x_layernorm.reset_parameters() + + def 
forward(self, x, vec, edge_index, edge_rbf, edge_vector): + xh = self.x_proj(self.x_layernorm(x)) + + # TODO(@abhshkdz): Nans out with AMP here during backprop. Debug / fix. + rbfh = self.rbf_proj(edge_rbf) + + # propagate_type: (xh: Tensor, vec: Tensor, rbfh_ij: Tensor, r_ij: Tensor) + dx, dvec = self.propagate( + edge_index, + xh=xh, + vec=vec, + rbfh_ij=rbfh, + r_ij=edge_vector, + size=None, + ) + + return dx, dvec + + def message(self, xh_j, vec_j, rbfh_ij, r_ij): + x, xh2, xh3 = torch.split(xh_j * rbfh_ij, self.hidden_channels, dim=-1) + xh2 = xh2 * self.inv_sqrt_3 + + vec = vec_j * xh2.unsqueeze(1) + xh3.unsqueeze(1) * r_ij.unsqueeze(2) + vec = vec * self.inv_sqrt_h + + return x, vec + + def aggregate( + self, + features: Tuple[torch.Tensor, torch.Tensor], + index: torch.Tensor, + ptr: Optional[torch.Tensor], + dim_size: Optional[int], + ) -> Tuple[torch.Tensor, torch.Tensor]: + x, vec = features + x = scatter(x, index, dim=self.node_dim, dim_size=dim_size) + vec = scatter(vec, index, dim=self.node_dim, dim_size=dim_size) + return x, vec + + def update( + self, inputs: Tuple[torch.Tensor, torch.Tensor] + ) -> Tuple[torch.Tensor, torch.Tensor]: + return inputs + + +class PaiNNUpdate(nn.Module): + def __init__(self, hidden_channels): + super().__init__() + self.hidden_channels = hidden_channels + + self.vec_proj = nn.Linear(hidden_channels, hidden_channels * 2, bias=False) + self.xvec_proj = nn.Sequential( + nn.Linear(hidden_channels * 2, hidden_channels), + ScaledSiLU(), + nn.Linear(hidden_channels, hidden_channels * 3), + ) + + self.inv_sqrt_2 = 1 / math.sqrt(2.0) + self.inv_sqrt_h = 1 / math.sqrt(hidden_channels) + + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.vec_proj.weight) + nn.init.xavier_uniform_(self.xvec_proj[0].weight) + self.xvec_proj[0].bias.data.fill_(0) + nn.init.xavier_uniform_(self.xvec_proj[2].weight) + self.xvec_proj[2].bias.data.fill_(0) + + def forward(self, x, vec): + vec1, vec2 = torch.split(self.vec_proj(vec), self.hidden_channels, dim=-1) + vec_dot = (vec1 * vec2).sum(dim=1) * self.inv_sqrt_h + + # NOTE: Can't use torch.norm because the gradient is NaN for input = 0. + # Add an epsilon offset to make sure sqrt is always positive. + x_vec_h = self.xvec_proj( + torch.cat([x, torch.sqrt(torch.sum(vec2**2, dim=-2) + 1e-8)], dim=-1) + ) + xvec1, xvec2, xvec3 = torch.split(x_vec_h, self.hidden_channels, dim=-1) + + dx = xvec1 + xvec2 * vec_dot + dx = dx * self.inv_sqrt_2 + + dvec = xvec3.unsqueeze(1) * vec1 + + return dx, dvec + + +class PaiNNOutput(nn.Module): + def __init__(self, hidden_channels): + super().__init__() + self.hidden_channels = hidden_channels + + self.output_network = nn.ModuleList( + [ + GatedEquivariantBlock( + hidden_channels, + hidden_channels // 2, + ), + GatedEquivariantBlock(hidden_channels // 2, 1), + ] + ) + + self.reset_parameters() + + def reset_parameters(self): + for layer in self.output_network: + layer.reset_parameters() + + def forward(self, x, vec): + for layer in self.output_network: + x, vec = layer(x, vec) + return vec.squeeze() + + +# Borrowed from TorchMD-Net +class GatedEquivariantBlock(nn.Module): + """Gated Equivariant Block as defined in Schütt et al. 
(2021): + Equivariant message passing for the prediction of tensorial properties and molecular spectra + """ + + def __init__( + self, + hidden_channels, + out_channels, + ): + super(GatedEquivariantBlock, self).__init__() + self.out_channels = out_channels + + self.vec1_proj = nn.Linear(hidden_channels, hidden_channels, bias=False) + self.vec2_proj = nn.Linear(hidden_channels, out_channels, bias=False) + + self.update_net = nn.Sequential( + nn.Linear(hidden_channels * 2, hidden_channels), + ScaledSiLU(), + nn.Linear(hidden_channels, out_channels * 2), + ) + + self.act = ScaledSiLU() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.vec1_proj.weight) + nn.init.xavier_uniform_(self.vec2_proj.weight) + nn.init.xavier_uniform_(self.update_net[0].weight) + self.update_net[0].bias.data.fill_(0) + nn.init.xavier_uniform_(self.update_net[2].weight) + self.update_net[2].bias.data.fill_(0) + + def forward(self, x, v): + vec1 = torch.norm(self.vec1_proj(v), dim=-2) + vec2 = self.vec2_proj(v) + + x = torch.cat([x, vec1], dim=-1) + x, v = torch.split(self.update_net(x), self.out_channels, dim=-1) + v = v.unsqueeze(1) * vec2 + + x = self.act(x) + return x, v diff --git a/ocpmodels/models/painn_nb6_scaling_factors.pt b/ocpmodels/models/painn_nb6_scaling_factors.pt new file mode 100644 index 0000000000000000000000000000000000000000..3843d7c80738debaba50a0dc1318caa2041296c8 GIT binary patch literal 2199 zcmbuB&1(}u7>6fGo2IKKv1-+-t=~LkLA8=${cs zV@P)ltFd9G>N?Srt!H+*6h*7TcolpJC{Y`y@lLC*H9JIW8@jDK-ehWN2pFVEF-fgM z8g^UJ9P??XVb`^m?&!}=Vmb;0eM*SA-flYDMpl7PB2gQmk~W3uYbFe_h*GwjI(f`K zRA9Ikpfza-*FtnQS{)cs0|X+ER5UZQX&6o0HVx8fuD6wkPTh3O`u&DM6d0Yspb$8Q z;rOQIo}B1H6hqAaxiS#9f)>3iYsg~O44kw)+ZeNXoKpl&V>rX0<6St5;haA-3+E+h z7A6?<0tdZF;1Y&O2A%4{WeivRp*grJL3418L9cVr8w74*m}byK7wAj4UNKH1s~ zBIF(w-^PxjqR`l>bh)%3wYAXWN1g0`c)RSXzr-e{=t4s7ag_R=sLqs2OZ}?7+nP_9 zp8wa6&vqaF?4bTXU+bQKb#&`NAO75+{@2@v=l{6!Dc*-aKdArj3+8|KX7zU;{=%UC z?>oBp{I%(+?f?5@X!)@JHcY2Ye+kQ_;vfx}Q{3+V_I_=Jt~ul$6*b^ahkFXYDC6!0 zVaPj-oaWb6R7mbn3!hG<vpqSRm9u;=irFgtiUe;?&hmXE zW_xlrFK7845VO@`$y<=K{5KdA-gA%3?)OYZ4Top_6yw06?NFV8bJ}{*0b6@E_?cz literal 0 HcmV?d00001 From 7b1d7a68f7fba64993f99256a55783cbabd905a8 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:06:31 -0500 Subject: [PATCH 265/273] Add COMENET model --- configs/models/comenet.yaml | 62 +++++++++++++++++++++++++++++++++++++ ocpmodels/models/comenet.py | 42 +++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 configs/models/comenet.yaml create mode 100644 ocpmodels/models/comenet.py diff --git a/configs/models/comenet.yaml b/configs/models/comenet.yaml new file mode 100644 index 0000000000..392bdfdc76 --- /dev/null +++ b/configs/models/comenet.yaml @@ -0,0 +1,62 @@ +default: + model: + name: comenet + use_pbc: True + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + 10k: {} + 100k: {} + all: {} + +# ------------------ +# ----- S2EF ----- +# ------------------ + +s2ef: + default: {} + 200k: {} + 2M: {} + 20M: {} + all: {} + +qm9: + default: + model: + cutoff: 5.0 + num_layers: 5 + hidden_channels: 256 + out_channels: 1 + num_radial: 3 + num_spherical: 2 + num_output_layers: 3 + optim: + batch_size: 32 + num_workers: 4 + lr_initial: 0.001 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 + + 10k: {} + all: {} + 
+qm7x: + default: {} + all: {} + 1k: {} + + diff --git a/ocpmodels/models/comenet.py b/ocpmodels/models/comenet.py new file mode 100644 index 0000000000..ec8ada6e54 --- /dev/null +++ b/ocpmodels/models/comenet.py @@ -0,0 +1,42 @@ +from dig.threedgraph.method import ComENet as DIGComENet +from ocpmodels.models.base_model import BaseModel +import torch +from ocpmodels.common.registry import registry +from ocpmodels.common.utils import conditional_grad +from copy import deepcopy + + +@registry.register_model("comenet") +class ComENet(BaseModel): + def __init__(self, **kwargs): + super().__init__() + self.regress_forces = False + self.cutoff = kwargs.get("cutoff", 5.0) + self.num_layers = kwargs.get("num_layers", 4) + self.hidden_channels = kwargs.get("hidden_channels", 128) + self.out_channels = kwargs.get("out_channels", 1) + self.num_spherical = kwargs.get("num_spherical", 3) + self.num_radial = kwargs.get("num_radial", 6) + self.num_output_layers = kwargs.get("num_output_layers", 3) + self.comenet = DIGComENet( + cutoff=self.cutoff, + num_layers=self.num_layers, + hidden_channels=self.hidden_channels, + out_channels=self.out_channels, + num_spherical=self.num_spherical, + num_radial=self.num_radial, + num_output_layers=self.num_output_layers, + ) + + @conditional_grad(torch.enable_grad()) + def forces_forward(self, preds): + return + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + # Rewire the graph + z = data.atomic_numbers.long() + batch_data = deepcopy(data) + batch_data.z = z + + return {"energy": self.comenet.forward(batch_data)} From 16c0f9eb10b58ace9c88868c7082717e0ed94e4f Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:06:45 -0500 Subject: [PATCH 266/273] Add SPHERENET model --- configs/models/spherenet.yaml | 20 +++++++++++--------- ocpmodels/models/spherenet.py | 4 +++- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/configs/models/spherenet.yaml b/configs/models/spherenet.yaml index 1cea3c24bc..32c0d7b6df 100644 --- a/configs/models/spherenet.yaml +++ b/configs/models/spherenet.yaml @@ -43,20 +43,22 @@ qm9: out_channels: 1 out_emb_channels: 256 optim: - batch_size: 1024 + batch_size: 32 + num_workers: 4 lr_initial: 0.001 max_epochs: 1000 decay_steps: 125000 decay_rate: 0.01 ema_decay: 0.999 - lr_gamma: 0.25 - lr_milestones: - - 17981 - - 26972 - - 35963 - - 52000 - - 100000 - warmup_steps: 1000 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 10k: {} all: {} diff --git a/ocpmodels/models/spherenet.py b/ocpmodels/models/spherenet.py index 7a5be57181..df0024fe8e 100644 --- a/ocpmodels/models/spherenet.py +++ b/ocpmodels/models/spherenet.py @@ -6,10 +6,12 @@ from copy import deepcopy +@registry.register_model("spherenet") class SphereNet(BaseModel): def __init__(self, **kwargs): super().__init__() self.energy_and_force = kwargs.get("energy_and_force", False) + self.regress_forces = "from_energy" if self.energy_and_force else False self.cutoff = kwargs.get("cutoff", 5.0) self.num_layers = kwargs.get("num_layers", 4) self.hidden_channels = kwargs.get("hidden_channels", 128) @@ -55,4 +57,4 @@ def energy_forward(self, data): batch_data = deepcopy(data) batch_data.z = z - return self.spherenet.forward(batch_data) + return {"energy": self.spherenet.forward(batch_data)} From f8882ec4912bd72acf25b7bd1559f9c6ce238159 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 
12:07:02 -0500 Subject: [PATCH 267/273] update dimenet (not ++) --- configs/models/dimenet.yaml | 69 ++++++++++++++++++++++++++++++++++++ ocpmodels/models/__init__.py | 2 +- ocpmodels/models/dimenet.py | 37 +++++++++++++++++-- 3 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 configs/models/dimenet.yaml diff --git a/configs/models/dimenet.yaml b/configs/models/dimenet.yaml new file mode 100644 index 0000000000..992296d8a8 --- /dev/null +++ b/configs/models/dimenet.yaml @@ -0,0 +1,69 @@ +default: + model: + name: dimenet + use_pbc: False + +# ------------------- +# ----- IS2RE ----- +# ------------------- + +is2re: + 10k: {} + 100k: {} + all: {} + +# ------------------ +# ----- S2EF ----- +# ------------------ + +s2ef: + default: {} + 200k: {} + 2M: {} + 20M: {} + all: {} + +qm9: + default: + model: + hidden_channels: 128 + out_channels: 1 + num_blocks: 6 + num_bilinear: 8 + num_spherical: 6 + num_radial: 6 + cutoff: 5.0 + max_num_neighbors: 40 + envelope_exponent: 5 + num_before_skip: 1 + num_after_skip: 2 + num_output_layers: 3 + act: swish + regress_forces: False + optim: + batch_size: 32 + num_workers: 4 + lr_initial: 0.001 + max_epochs: 1000 + decay_steps: 125000 + decay_rate: 0.01 + ema_decay: 0.999 + # all below is for the scheduler + scheduler: ReduceLROnPlateau + mode: min + factor: 0.95 + threshold: 0.0001 + threshold_mode: abs + min_lr: 0.000001 + verbose: true + patience: 10 + + 10k: {} + all: {} + +qm7x: + default: {} + all: {} + 1k: {} + + diff --git a/ocpmodels/models/__init__.py b/ocpmodels/models/__init__.py index ad3c0ccce9..9ad38b86bc 100644 --- a/ocpmodels/models/__init__.py +++ b/ocpmodels/models/__init__.py @@ -5,7 +5,7 @@ from .base_model import BaseModel # noqa: F401 from .cgcnn import CGCNN # noqa: F401 -from .dimenet import DimeNetWrap as DimeNet # noqa: F401 +from .dimenet import DimeNet # noqa: F401 from .old_dimenet_plus_plus import ( # noqa: F401 DimeNetPlusPlusWrap as OldDimeNetPlusPlus, ) diff --git a/ocpmodels/models/dimenet.py b/ocpmodels/models/dimenet.py index 341dee6da0..f74c58c644 100644 --- a/ocpmodels/models/dimenet.py +++ b/ocpmodels/models/dimenet.py @@ -6,11 +6,12 @@ """ import torch -from torch_geometric.nn import DimeNet, radius_graph +from torch_geometric.nn import DimeNet as PYGDimeNet, radius_graph from torch_scatter import scatter from torch_sparse import SparseTensor from ocpmodels.common.registry import registry +from ocpmodels.models.base_model import BaseModel from ocpmodels.common.utils import ( conditional_grad, get_pbc_distances, @@ -18,8 +19,40 @@ ) +@registry.register_model("dimenet") +class DimeNet(BaseModel): + def __init__(self, **kwargs) -> None: + super().__init__() + self.regress_forces = bool(kwargs.get("regress_forces")) + self.dimenet = PYGDimeNet( + hidden_channels=kwargs.get("hidden_channels"), + out_channels=kwargs.get("out_channels"), + num_blocks=kwargs.get("num_blocks"), + num_bilinear=kwargs.get("num_bilinear"), + num_spherical=kwargs.get("num_spherical"), + num_radial=kwargs.get("num_radial"), + cutoff=kwargs.get("cutoff"), + max_num_neighbors=kwargs.get("max_num_neighbors"), + envelope_exponent=kwargs.get("envelope_exponent"), + num_before_skip=kwargs.get("num_before_skip"), + num_after_skip=kwargs.get("num_after_skip"), + num_output_layers=kwargs.get("num_output_layers"), + act=kwargs.get("act"), + ) + + @conditional_grad(torch.enable_grad()) + def energy_forward(self, data): + return { + "energy": self.dimenet.forward(data.atomic_numbers, data.pos, data.batch) + } + + 
@conditional_grad(torch.enable_grad()) + def forces_forward(self, preds): + return + + @registry.register_model("old_dimenet") -class DimeNetWrap(DimeNet): +class OldDimeNetWrap(PYGDimeNet): r"""Wrapper around the directional message passing neural network (DimeNet) from the `"Directional Message Passing for Molecular Graphs" `_ paper. From fc5e6f2a3331eaf7f1d95ea43887fe6a9d1b3d1b Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:07:49 -0500 Subject: [PATCH 268/273] define `model_config`outside of model init to prevent kwarg duplication --- ocpmodels/trainers/base_trainer.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 13252ad414..3947f144a2 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -356,12 +356,18 @@ def load_model(self): if hasattr(sample, "x") and hasattr(sample.x, "shape"): num_atoms = sample.x.shape[-1] - self.model = registry.get_model_class(self.config["model_name"])( - num_atoms=num_atoms, - bond_feat_dim=bond_feat_dim, - num_targets=self.num_targets, - task_name=self.task_name, + model_config = { + **{ + "num_atoms": num_atoms, + "bond_feat_dim": bond_feat_dim, + "num_targets": self.num_targets, + "task_name": self.task_name, + }, **self.config["model"], + } + + self.model = registry.get_model_class(self.config["model_name"])( + **model_config ).to(self.device) if dist_utils.is_master() and not self.silent: From deae37669e7166a8a0566f4268661e0cfd42f8b0 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:08:18 -0500 Subject: [PATCH 269/273] enable `eval_all_splits` from a ckpt file path --- ocpmodels/trainers/base_trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocpmodels/trainers/base_trainer.py b/ocpmodels/trainers/base_trainer.py index 3947f144a2..211662a443 100644 --- a/ocpmodels/trainers/base_trainer.py +++ b/ocpmodels/trainers/base_trainer.py @@ -808,7 +808,7 @@ def save_results(self, predictions, results_file, keys): np.savez_compressed(full_path, **gather_results) def eval_all_splits( - self, final=True, disable_tqdm=True, debug_batches=-1, epoch=-1 + self, final=True, disable_tqdm=True, debug_batches=-1, epoch=-1, from_ckpt=None ): """Evaluate model on all four validation splits""" @@ -824,7 +824,9 @@ def eval_all_splits( logging.info(f"Evaluating on {len(all_splits)} val splits.") # Load current best checkpoint for final evaluation - if final and epoch != 0: + if from_ckpt: + self.load_checkpoint(checkpoint_path=from_ckpt) + elif final and epoch != 0: checkpoint_path = os.path.join( self.config["checkpoint_dir"], "best_checkpoint.pt" ) From d5d15b157c4eedfdc4e890627afa90fe16b276a6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:08:40 -0500 Subject: [PATCH 270/273] `end_of_training` from ckpt --- ocpmodels/trainers/single_trainer.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index d5a3d2a197..6ac75b81cf 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -391,9 +391,23 @@ def train(self, disable_eval_tqdm=True, debug_batches=-1): epoch_int, debug_batches, model_run_time, epoch_times ) - def end_of_training(self, epoch_int, debug_batches, model_run_time, epoch_times): + def end_of_training( + self, + epoch_int, + debug_batches, + model_run_time, + 
epoch_times, + from_ckpt=None, + disable_tqdm=True, + ): - eas = self.eval_all_splits(True, epoch=epoch_int, debug_batches=debug_batches) + eas = self.eval_all_splits( + True, + epoch=epoch_int, + debug_batches=debug_batches, + from_ckpt=from_ckpt, + disable_tqdm=disable_tqdm, + ) if eas == "SIGTERM": return "SIGTERM" From e930138c28b4772579d91f746e89fabeffd030f6 Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:08:56 -0500 Subject: [PATCH 271/273] print symmetry results if not silent --- ocpmodels/trainers/single_trainer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocpmodels/trainers/single_trainer.py b/ocpmodels/trainers/single_trainer.py index 6ac75b81cf..c9be544f15 100644 --- a/ocpmodels/trainers/single_trainer.py +++ b/ocpmodels/trainers/single_trainer.py @@ -438,6 +438,8 @@ def end_of_training( return "SIGTERM" if self.logger: self.logger.log(symmetry) + if not self.silent: + print(symmetry) # TODO: Test equivariance From 7765e6eb659989d399edb234a0222912aba7212c Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:09:10 -0500 Subject: [PATCH 272/273] rename former eval script --- ...odels_on_all_splits.py => legacy_eval_models_on_all_splits.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/{eval_models_on_all_splits.py => legacy_eval_models_on_all_splits.py} (100%) diff --git a/scripts/eval_models_on_all_splits.py b/scripts/legacy_eval_models_on_all_splits.py similarity index 100% rename from scripts/eval_models_on_all_splits.py rename to scripts/legacy_eval_models_on_all_splits.py From d5afd6b5e5ef5a2f919e8685f72fb88507cf138d Mon Sep 17 00:00:00 2001 From: Victor Schmidt Date: Wed, 8 Feb 2023 12:09:33 -0500 Subject: [PATCH 273/273] new eval script with `continue_from_dir` and `end_of_training` --- scripts/eval_model.py | 56 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/eval_model.py diff --git a/scripts/eval_model.py b/scripts/eval_model.py new file mode 100644 index 0000000000..bcf2ef7502 --- /dev/null +++ b/scripts/eval_model.py @@ -0,0 +1,56 @@ +import sys +from copy import deepcopy +from pathlib import Path + +from minydra import resolved_args + +sys.path.append(str(Path(__file__).resolve().parent.parent)) + +from ocpmodels.common.flags import flags +from ocpmodels.common.utils import build_config, resolve, setup_imports, merge_dicts +from ocpmodels.trainers.single_trainer import SingleTrainer + +if __name__ == "__main__": + + args = resolved_args( + defaults={ + "job_id": None, + "dir": None, + "config": {}, + }, + strict=False, + ) + assert ( + args.job_id is not None or args.dir is not None + ), "Must specify either job_id or dir." + + path = ( + resolve(args.dir) + if args.dir is not None + else resolve("$SCRATCH/ocp/runs") / str(args.job_id) + ) + + setup_imports() + argv = deepcopy(sys.argv) + sys.argv[1:] = [] + trainer_args = flags.parser.parse_args() + sys.argv[1:] = argv + trainer_args.continue_from_dir = str(path) + config = build_config(trainer_args, []) + config["logger"] = "dummy" + config["checkpoint"] = str(path / "checkpoints" / "best_checkpoint.pt") + config = merge_dicts(config, args.config) + + trainer = SingleTrainer(**config) + + trainer.silent = False + trainer.eval_on_test = True + + trainer.end_of_training( + -1, + -1, + -1, + [-1], + from_ckpt=config["checkpoint"], + disable_tqdm=False, + )
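
For reference, `scripts/eval_model.py` is driven by minydra-style `key=value` arguments rather than the argparse flags used by the trainers: it resolves the run directory from either `job_id` (defaulting to `$SCRATCH/ocp/runs/<job_id>`) or an explicit `dir`, loads `checkpoints/best_checkpoint.pt`, and calls `end_of_training` with `from_ckpt` so every validation split is re-evaluated. A hypothetical invocation could look like the following (the job id, path and override are placeholders, and the nested `config.*` override assumes minydra's dotted-key syntax):

    python scripts/eval_model.py job_id=1234567
    python scripts/eval_model.py dir=/path/to/run config.optim.batch_size=16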