diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index fbecc133..902a148d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -7,6 +7,13 @@ on:
     branches: [main]
   release:
     types: [published]
+  workflow_dispatch:
+    inputs:
+      task:
+        type: choice
+        options: [tests, release]
+        default: tests
+        description: Only run tests or release a new version of matbench-discovery to PyPI after tests pass.
 
 jobs:
   tests:
diff --git a/matbench_discovery/plots.py b/matbench_discovery/plots.py
index 5f0f011a..6b8d5ae3 100644
--- a/matbench_discovery/plots.py
+++ b/matbench_discovery/plots.py
@@ -65,6 +65,7 @@ def unit(text: str) -> str:
 )
 model_labels = dict(
     alignn="ALIGNN",
+    alignn_ff="ALIGNN FF",
     alignn_pretrained="ALIGNN Pretrained",
     bowsr_megnet="BOWSR",
     chgnet="CHGNet",
diff --git a/matbench_discovery/preds.py b/matbench_discovery/preds.py
index 097e1094..3d70663c 100644
--- a/matbench_discovery/preds.py
+++ b/matbench_discovery/preds.py
@@ -65,6 +65,7 @@ class PredFiles(Files):
     alignn = "alignn/2023-06-02-alignn-wbm-IS2RE.csv.gz"
     # alignn_pretrained = "alignn/2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz"
+    alignn_ff = "alignn_ff/2023-07-11-alignn-ff-wbm-IS2RE.csv.gz"
 
     # model_labels remaps model keys to pretty plot labels (see Files)
diff --git a/models/alignn/metadata.yml b/models/alignn/metadata.yml
index f7ed783a..520609ad 100644
--- a/models/alignn/metadata.yml
+++ b/models/alignn/metadata.yml
@@ -11,7 +11,6 @@ authors:
   - name: Brian DeCost
     affiliation: National Institute of Standards and Technology
     orcid: https://orcid.org/0000-0002-3459-5888
-    email: zhongpc@berkeley.edu
   - name: Philipp Benner
     affiliation: Bundesanstalt für Materialforschung und -prüfung BAM
     orcid: https://orcid.org/0000-0002-0912-8137
@@ -19,7 +18,7 @@ authors:
 repo: https://github.com/usnistgov/alignn
 url: https://jarvis.nist.gov/jalignn
 doi: https://nature.com/articles/s41524-021-00650-1
-preprint: https://arxiv.org/abs/2209.05554
+preprint: https://arxiv.org/abs/2106.01829
 requirements:
   ase: 3.22.0
   dgl-cu111: 0.6.1
diff --git a/models/alignn/readme.md b/models/alignn/readme.md
index 85a59d01..34330676 100644
--- a/models/alignn/readme.md
+++ b/models/alignn/readme.md
@@ -20,6 +20,5 @@ Replace `/path/to/` with the actual path to the patch file.
 
 The directory contains the following files, which must be executed in the given order to reproduce the results:
 
-1. `train_data.py`: Export Matbench Discovery training data to ALIGNN compatible format. This script outputs training data in the directory `data_train`. In addition, a small test data set is set apart and stored in the directory `data_test`
-1. `train_alignn.py`: Train an ALIGNN model on previously exported data. The resulting model is stored in the directory `data-train-result`
-1. `test_alignn.py`: Test a trained ALIGNN model on the WBM data. Generates `2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz`.
+1. `train_alignn.py`: Train an ALIGNN model on all 154k MP computed structure entries. The resulting model checkpoint is saved to the directory set by the `out_dir` variable in that script and also uploaded to `wandb`, from where it is publicly available for 3rd-party reproducibility (see the download sketch below).
+1. `test_alignn.py`: Test a trained ALIGNN model on the WBM data. Generates `2023-06-03-mp-e-form-alignn-wbm-IS2RE.csv.gz`.
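+
+The trained checkpoint on `wandb` can be pulled down with the public API for 3rd-party reproduction. A minimal sketch, assuming a hypothetical run path (substitute the actual entity/project/run ID from the training logs; the checkpoint file name follows the commented `best-model.pth` path in `test_alignn.py`):
+
+```py
+import wandb
+
+api = wandb.Api()
+# hypothetical run path, replace with the real entity/project/run ID
+run = api.run("<entity>/matbench-discovery/<run-id>")
+run.file("best-model.pth").download(root=".")  # saves ./best-model.pth
+```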
diff --git a/models/alignn/test_alignn.py b/models/alignn/test_alignn.py
index 30a703a2..eb3abce1 100644
--- a/models/alignn/test_alignn.py
+++ b/models/alignn/test_alignn.py
@@ -30,6 +30,7 @@
 
 # %%
 model_name = "mp_e_form_alignn"  # pre-trained by NIST
+# TODO fix this to load checkpoint from figshare
 # model_name = f"{module_dir}/data-train-result/best-model.pth"
 task_type = "IS2RE"
 target_col = "e_form_per_atom_mp2020_corrected"
diff --git a/models/alignn_ff/2023-07-11-alignn-ff-wbm-IS2RE.csv.gz b/models/alignn_ff/2023-07-11-alignn-ff-wbm-IS2RE.csv.gz
new file mode 100644
index 00000000..e1da1929
Binary files /dev/null and b/models/alignn_ff/2023-07-11-alignn-ff-wbm-IS2RE.csv.gz differ
diff --git a/models/alignn_ff/alignn-ff-2023.07.05.patch b/models/alignn_ff/alignn-ff-2023.07.05.patch
new file mode 100644
index 00000000..4e16c8ad
--- /dev/null
+++ b/models/alignn_ff/alignn-ff-2023.07.05.patch
@@ -0,0 +1,73 @@
+diff --git a/alignn/ff/ff.py b/alignn/ff/ff.py
+index 2dc916f..a569184 100644
+--- a/alignn/ff/ff.py
++++ b/alignn/ff/ff.py
+@@ -46,6 +46,8 @@ from jarvis.analysis.defects.surface import Surface
+ # from jarvis.core.kpoints import Kpoints3D as Kpoints
+ # from jarvis.core.atoms import get_supercell_dims
+ 
++import torch
++
+ try:
+     from gpaw import GPAW, PW
+ except Exception:
+@@ -62,7 +64,6 @@ __author__ = "Kamal Choudhary, Brian DeCost, Keith Butler, Lily Major"
+ def default_path():
+     """Get default model path."""
+     dpath = os.path.abspath(str(os.path.join(os.path.dirname(__file__), ".")))
+-    print("model_path", dpath)
+     return dpath
+ 
+ 
+@@ -138,8 +139,6 @@ class AlignnAtomwiseCalculator(ase.calculators.calculator.Calculator):
+ 
+         config.model.output_features = 1
+ 
+-        import torch
+-
+         if self.device is None:
+             self.device = torch.device(
+                 "cuda" if torch.cuda.is_available() else "cpu"
+@@ -193,6 +192,7 @@ class ForceField(object):
+         logfile="alignn_ff.log",
+         dyn=None,
+         communicator=None,
++        device="cuda" if torch.cuda.is_available() else "cpu",
+     ):
+         """Initialize class."""
+         self.jarvis_atoms = jarvis_atoms
+@@ -225,12 +225,13 @@ class ForceField(object):
+         # print ('STRUCTURE PROVIDED:')
+         # print (ase_to_atoms(self.atoms))
+         # print ()
++        import torch
+         self.atoms.set_calculator(
+             AlignnAtomwiseCalculator(
+                 path=self.model_path,
+                 include_stress=self.include_stress,
+                 model_filename=self.model_filename,
+-                # device="cuda" if torch.cuda.is_available() else "cpu",
++                device=device,
+             )
+         )
+ 
+@@ -238,6 +239,7 @@
+         """Print info."""
+         if isinstance(self.atoms, ExpCellFilter):
+             self.atoms = self.atoms.atoms
++        return
+         line = ""
+         try:
+             line = f"time={self.dyn.get_time() / units.fs: 5.0f} fs "
+@@ -297,9 +299,9 @@
+             raise ValueError("Check optimizer", optimizer)
+         if optimize_lattice:
+             self.atoms = ExpCellFilter(self.atoms)
+-        print("OPTIMIZATION")
++
+         self.dyn = optimizer(
+-            self.atoms, trajectory="opt.traj", logfile="opt.log"
++            self.atoms, trajectory=trajectory, logfile=logfile
+         )
+         self.dyn.attach(self.print_format, interval=interval)
+         self.dyn.run(fmax=fmax, steps=steps)
diff --git a/models/alignn_ff/alignn_ff_relax.py b/models/alignn_ff/alignn_ff_relax.py
new file mode 100644
index 00000000..cb84c59d
--- /dev/null
+++ b/models/alignn_ff/alignn_ff_relax.py
@@ -0,0 +1,108 @@
+# %%
+from __future__ import annotations
+
+import os
+
+import numpy as np
+import pandas as pd
+from pymatgen.core import Structure
+from pymatgen.io.jarvis import JarvisAtomsAdaptor
+from tqdm.contrib.concurrent import process_map
+
+from matbench_discovery import DEBUG, today
+from matbench_discovery.data import DATA_FILES, df_wbm
+
+__author__ = "Janosh Riebesell, Philipp Benner"
+__date__ = "2023-07-11"
+
+
+# %% read environment variables
+batch = int(os.getenv("TASK_ID", default="0"))
+out_dir = os.getenv("SBATCH_OUTPUT", default=f"{today}-alignn-wbm-IS2RE")
+
+
+# %%
+n_splits = 100
+n_processes_per_task = 10
+module_dir = os.path.dirname(__file__)
+# model_name = "mp_e_form_alignn"  # pre-trained by NIST
+model_name = f"{out_dir}/best-model.pth"
+task_type = "IS2RE"
+target_col = "e_form_per_atom_mp2020_corrected"
+input_col = "initial_structure"
+id_col = "material_id"
+job_name = f"{model_name}-wbm-{task_type}{'-debug' if DEBUG else ''}"
+out_path = (
+    f"{out_dir}/{'alignn-relaxed-structs' if batch == 0 else f'{batch=}'}.json.gz"
+)
+
+if batch < 0 or batch > n_splits:
+    raise SystemExit(f"Invalid task_id={batch}")
+if batch > 0 and not os.path.exists(out_dir):
+    os.mkdir(out_dir)
+if os.path.isfile(out_path):
+    raise SystemExit(f"{out_path = } already exists, exiting")
+
+
+# %% Load data
+data_path = {
+    "IS2RE": DATA_FILES.wbm_initial_structures,
+    "RS2RE": DATA_FILES.wbm_computed_structure_entries,
+}[task_type]
+input_col = {"IS2RE": "initial_structure", "RS2RE": "relaxed_structure"}[task_type]
+
+df_in = pd.read_json(data_path).set_index(id_col)
+
+df_in[target_col] = df_wbm[target_col]
+if task_type == "RS2RE":
+    df_in[input_col] = [x["structure"] for x in df_in.computed_structure_entry]
+assert input_col in df_in, f"{input_col=} not in {list(df_in)}"
+
+# Split data into parts and process only one batch
+if batch != 0:
+    df_in = np.array_split(df_in, n_splits)[batch - 1]
+    print(f"Relaxing materials in range {df_in.index[0]} - {df_in.index[-1]}")
+else:
+    print("Relaxing full range of materials")
+
+
+# %% Relax structures
+def alignn_relax(structure: dict) -> Structure:
+    """Relax structure using ALIGNN FF.
+
+    Args:
+        structure (dict): pymatgen Structure as dict to relax.
+
+    Returns:
+        Structure: Relaxed structure.
+ """ + # Cuda must be only initialized in child processes + import torch + from alignn.ff.ff import ForceField, default_path + + ff = ForceField( + jarvis_atoms=JarvisAtomsAdaptor.get_atoms(Structure.from_dict(structure)), + model_path=default_path(), + device=f"cuda:{batch % 4}" if torch.cuda.is_available() else "cpu", + logfile="/dev/null", + ) + # Relax structure + opt, _, _ = ff.optimize_atoms(trajectory=None, logfile="/dev/null") + + return JarvisAtomsAdaptor.get_structure(opt) + + +structures = [ + df_in.loc[material_id]["initial_structure"] for material_id in df_in.index +] +df_relaxed = tqdm(structures, alignn_relax, n_jobs=n_processes_per_task) + +df_in = df_in.assign(relaxed_structure=df_relaxed) + + +# %% save results +df_in.to_json(out_path) + +# Examples of materials that take ages to converge: +# task_id = 75, df_in.iloc[856]: wbm-3-76848 +# task_id = 75, df_in.iloc[986]: wbm-3-76978 diff --git a/models/alignn_ff/metadata_aborted.yml b/models/alignn_ff/metadata_aborted.yml new file mode 100644 index 00000000..0b0f3ed0 --- /dev/null +++ b/models/alignn_ff/metadata_aborted.yml @@ -0,0 +1,42 @@ +model_name: ALIGNN FF +model_version: 2023.07.01 +matbench_discovery_version: 1.0 +date_added: "2023-07-11" +date_published: "2022-09-16" +authors: + - name: Kamal Choudhary + affiliation: National Institute of Standards and Technology + email: kamal.choudhary@nist.gov + orcid: https://orcid.org/0000-0001-9737-8074 + - name: Brian DeCost + affiliation: National Institute of Standards and Technology + orcid: https://orcid.org/0000-0002-3459-5888 + - name: Lily Major + affiliation: Aberystwyth University, UK + orcid: https://orcid.org/0000-0002-5783-8432 + - name: Keith Butler + affiliation: Rutherford Appleton Laboratory, UK + orcid: https://orcid.org/0000-0001-5432-5597 + - name: Jeyan Thiyagalingam + affiliation: Rutherford Appleton Laboratory, UK + orcid: https://orcid.org/0000-0002-2167-1343 + - name: Francesca Tavazza + affiliation: National Institute of Standards and Technology + orcid: https://orcid.org/0000-0002-5602-180X + - name: Philipp Benner + affiliation: Bundesanstalt für Materialforschung und -prüfung BAM + orcid: https://orcid.org/0000-0002-0912-8137 + github: https://github.com/pbenner +repo: https://github.com/usnistgov/alignn +url: https://jarvis.nist.gov/jalignn +doi: https://doi.org/10.1039/D2DD00096B +preprint: https://arxiv.org/abs/2209.05554 +requirements: + ase: 3.22.0 + dgl-cu111: 0.6.1 + numpy: 1.24.3 + pandas: 2.0.1 + scikit-learn: 1.2.2 + torch: 1.9.0+cu111 +trained_for_benchmark: false +# hyperparams: see align-config.json diff --git a/models/alignn_ff/readme.md b/models/alignn_ff/readme.md new file mode 100644 index 00000000..1b6c7979 --- /dev/null +++ b/models/alignn_ff/readme.md @@ -0,0 +1,39 @@ +# ALIGNN-FF (aborted) + +The [ALIGNN FF model submission](https://github.com/janosh/matbench-discovery/pull/47) intended to get a complete set of formation energy predictions for the WBM test set post-ALIGNN-FF structure relaxation (i.e. the WBM IS2RE task). + +This effort was aborted for the following reasons: + +1. **Incompatibility issues**: ALIGNN-FF was pre-trained on the JARVIS data, which among other differences uses the OptB88vdW functional and is incompatible with the WBM test set generated using Materials Project workflows. +1. **Training difficulties**: ALIGNN-FF proved to be very resource-hungry. [12 GB of MPtrj training data](https://figshare.com/articles/dataset/23713842) turned into 600 GB of ALIGNN graph data. 
+1. **Ineffectiveness of fine-tuning**: Efforts to fine-tune the ALIGNN-FF WT10 model on the CHGNet data suffered a high initial loss, worse even than that of the untrained model, indicating significant dataset incompatibility.
+
+The decision to abort adding ALIGNN FF to Matbench Discovery v1 was made after weeks of work due to ongoing technical challenges and resource limitations. See the [PR discussion](https://github.com/janosh/matbench-discovery/pull/47) for further details.
+
+## Fine-tuning
+
+We attempted fine-tuning the [`alignnff_wt10` checkpoint](https://github.com/usnistgov/alignn/blob/461b35fe6e5ed7ade7cbf9b345773e941371ecfc/alignn/ff/alignnff_wt10/best_model.pt).
+
+The patch `alignn-ff-2023.07.05.patch` fixes the following issue:
+
+```bash
+Traceback (most recent call last):
+  File "alignn_relax.py", line 96, in <module>
+  File "alignn_relax.py", line 88, in alignn_relax
+  File "../alignn/ff/ff.py", line 310, in optimize_atoms
+  File "../alignn/lib/python3.9/site-packages/ase/optimize/optimize.py", line 269, in run
+  File "../alignn/lib/python3.9/site-packages/ase/optimize/optimize.py", line 156, in run
+  File "../alignn/lib/python3.9/site-packages/ase/optimize/optimize.py", line 129, in irun
+  File "../alignn/lib/python3.9/site-packages/ase/optimize/optimize.py", line 108, in call_observers
+  File "../alignn/lib/python3.9/site-packages/ase/io/trajectory.py", line 132, in write
+  File "../alignn/lib/python3.9/site-packages/ase/io/trajectory.py", line 156, in _write_atoms
+  File "../alignn/lib/python3.9/site-packages/ase/io/trajectory.py", line 381, in write_atoms
+  File "../alignn/lib/python3.9/site-packages/ase/io/ulm.py", line 400, in write
+  File "../alignn/lib/python3.9/site-packages/ase/io/ulm.py", line 325, in fill
+OSError: [Errno 24] Too many open files
+```
+
+## Scripts
+
+1. `alignn_ff_relax.py`: Relax WBM test set structures (see the batching sketch below). Set the variable `n_splits` to the number of GPU compute nodes. On each compute node, set the environment variable `TASK_ID` to a value in the range 1 to `n_splits`. Set the variable `n_processes_per_task` to the number of processes on a single node. For 48 CPU cores with 4 GPUs, a good setting is 10 processes.
+2. `test_alignn_ff.py`: Read the relaxed structures from `alignn_ff_relax.py` and make formation energy predictions. Set the variable `n_splits` accordingly.
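+
+As a rough scheduling aid, here is a minimal sketch (not part of the original submission; the structure count is a hypothetical stand-in) of how `TASK_ID` maps to a slice of the test set in `alignn_ff_relax.py`: batch 0 relaxes everything, while batches 1 to `n_splits` each relax one chunk produced by `np.array_split`:
+
+```py
+import numpy as np
+import pandas as pd
+
+n_splits = 100  # must match the value set in alignn_ff_relax.py
+# hypothetical stand-in for the WBM index (roughly 257k IDs in the real test set)
+df_in = pd.DataFrame(index=[f"wbm-{idx}" for idx in range(1_000)])
+
+for task_id in (0, 1, n_splits):  # example values of the TASK_ID env var
+    # batch 0 processes the full set; batches 1..n_splits each get one chunk
+    df_batch = df_in if task_id == 0 else np.array_split(df_in, n_splits)[task_id - 1]
+    print(f"{task_id=}: {len(df_batch)} structures, first ID {df_batch.index[0]}")
+```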
diff --git a/models/alignn_ff/test_alignn_ff.py b/models/alignn_ff/test_alignn_ff.py
new file mode 100644
index 00000000..6fdf658c
--- /dev/null
+++ b/models/alignn_ff/test_alignn_ff.py
@@ -0,0 +1,128 @@
+# %%
+from __future__ import annotations
+
+import json
+import os
+from glob import glob
+from importlib.metadata import version
+
+import pandas as pd
+import torch
+import wandb
+from alignn.config import TrainingConfig
+from alignn.models.alignn import ALIGNN
+from alignn.pretrained import all_models, get_figshare_model
+from jarvis.core.graphs import Graph
+from pymatgen.core import Structure
+from pymatgen.io.jarvis import JarvisAtomsAdaptor
+from sklearn.metrics import r2_score
+from tqdm import tqdm
+
+from matbench_discovery import DEBUG, today
+from matbench_discovery.data import DATA_FILES, df_wbm
+from matbench_discovery.plots import wandb_scatter
+
+__author__ = "Philipp Benner, Janosh Riebesell"
+__date__ = "2023-07-11"
+
+module_dir = os.path.dirname(__file__)
+
+
+# %%
+n_splits = 100
+# model_name = "mp_e_form_alignn"  # pre-trained by NIST
+task_type = "IS2RE"
+target_col = "e_form_per_atom_mp2020_corrected"
+input_col = "initial_structure"
+id_col = "material_id"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model_name = f"alignn-ff-wbm-{task_type}"
+job_name = f"{model_name}-relaxed-wbm-{task_type}{'-debug' if DEBUG else ''}"
+out_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
+in_dir = os.getenv("SBATCH_OUTPUT", f"{module_dir}/{today}-{job_name}")
+
+
+if model_name in all_models:  # load pre-trained model
+    model = get_figshare_model(model_name)
+    pred_col = "e_form_per_atom_alignn_pretrained"
+elif os.path.isfile(model_name):
+    pred_col = "e_form_per_atom_alignn"
+    with open(f"{module_dir}/alignn-config.json") as file:
+        config = TrainingConfig(**json.load(file))
+
+    model = ALIGNN(config.model)
+    # load trained ALIGNN model
+    state_dict = torch.load(model_name, map_location=device)
+    model.load_state_dict(state_dict)
+    model = model.to(device)
+else:
+    raise ValueError(
+        f"{model_name=} not found, train a model or use pre-trained {list(all_models)}"
+    )
+
+
+# %% Load data
+data_path = {
+    "IS2RE": DATA_FILES.wbm_initial_structures,
+    "RS2RE": DATA_FILES.wbm_computed_structure_entries,
+}[task_type]
+input_col = "relaxed_structure"
+# load ALIGNN-FF relaxed structures (TODO fix directory we're loading from)
+df_in = pd.concat(map(pd.read_json, glob(f"{module_dir}/data-train-result/*.json.gz")))
+
+
+# %%
+run_params = dict(
+    data_path=data_path,
+    **{f"{dep}_version": version(dep) for dep in ("alignn", "numpy")},
+    model_name=model_name,
+    task_type=task_type,
+    target_col=target_col,
+    df=dict(shape=str(df_in.shape), columns=", ".join(df_in)),
+)
+
+wandb.init(project="matbench-discovery", name=job_name, config=run_params)
+
+
+# %% Predict
+model.eval()
+e_form_preds: dict[str, float] = {}
+with torch.no_grad():  # get predictions
+    for material_id, structure in tqdm(
+        df_in[input_col].items(),
+        total=len(df_in),
+        desc=f"Predicting {target_col=} {task_type}",
+    ):
+        atoms = JarvisAtomsAdaptor.get_atoms(Structure.from_dict(structure))
+
+        atom_graph, line_graph = Graph.atom_dgl_multigraph(atoms)
+        e_form = model([atom_graph.to(device), line_graph.to(device)]).item()
+
+        e_form_preds[material_id] = e_form
+
+# wrap in a Series to align predictions with df_wbm's material_id index
+df_wbm[pred_col] = pd.Series(e_form_preds)
+
+# swap legacy MP energy corrections for the MP2020 scheme
+df_wbm[pred_col] -= df_wbm.e_correction_per_atom_mp_legacy
+df_wbm[pred_col] += df_wbm.e_correction_per_atom_mp2020
+
+if model_name in all_models:
+    df_wbm[pred_col].round(4).to_csv(
f"{module_dir}/{today}-{model_name}-relaxed-wbm-IS2RE.csv.gz" + ) +else: + df_wbm[pred_col].round(4).to_csv( + f"{module_dir}/{today}-alignn-relaxed-wbm-IS2RE.csv.gz" + ) + + +# %% +df_wbm = df_wbm.dropna() + +table = wandb.Table(dataframe=df_wbm[[target_col, pred_col]].reset_index()) + +MAE = (df_wbm[target_col] - df_wbm[pred_col]).abs().mean() +R2 = r2_score(df_wbm[target_col], df_wbm[pred_col]) +title = f"{model_name} {task_type} {MAE=:.4} {R2=:.4}" +print(title) + +wandb_scatter(table, fields=dict(x=target_col, y=pred_col), title=title)