add 2023-01-25-rolling-mae-vs-hull-dist-models.svelte

fix color of SEM shaded area and rolling window annotation in rolling_mae_vs_hull_dist() for backend=plotly
add site/src/figs/2023-01-26-wbm-each-hist.svelte to wbm/readme.md
implement rolling_acc for backend=plotly branch of hist_classified_stable_vs_hull_dist()
janosh · Jun 20, 2023 · 9fed210 · 9fed210
1 parent 2f795f7
commit 9fed210
Show file tree

Hide file tree

Showing 26 changed files with 344 additions and 155 deletions.
diff --git a/data/mp/build_phase_diagram.py b/data/mp/build_phase_diagram.py
@@ -109,4 +109,4 @@
     xlabel="MP Formation Energy (eV/atom)",
     ylabel="Our Formation Energy (eV/atom)",
 )
-ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.png", dpi=300)
+ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.webp", dpi=300)
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -80,7 +80,7 @@
 annotate_mae_r2(df.formation_energy_per_atom, df.decomposition_enthalpy)
 # result on 2023-01-10: plots match. no correlation between formation energy and decomposition
 # enthalpy. R^2 = -1.571, MAE = 1.604
-# ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.png", dpi=300)
+# ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.webp", dpi=300)
 
 
 # %% scatter plot energy above convex hull vs decomposition enthalpy
@@ -99,4 +99,4 @@
     title=f"{n_above_line:,} / {len(df):,} = {n_above_line/len(df):.1%} "
     "MP materials with\nenergy_above_hull - decomposition_enthalpy.clip(0) > 0.1"
 )
-# ax.figure.savefig(f"{module_dir}/{today}-mp-e-above-hull-vs-decomp-enth.png", dpi=300)
+# ax.figure.savefig(f"{module_dir}/{today}-mp-e-above-hull-vs-decomp-enth.webp", dpi=300)
diff --git a/data/wbm/analysis.py b/data/wbm/analysis.py
@@ -7,13 +7,15 @@
 
 from matbench_discovery import FIGS, today
 from matbench_discovery.data import df_wbm
-
-module_dir = os.path.dirname(__file__)
+from matbench_discovery.plots import pio
 
 """
 Compare MP and WBM elemental prevalence. Starting with WBM, MP below.
 """
 
+module_dir = os.path.dirname(__file__)
+print(f"{pio.templates.default=}")
+
 
 # %%
 wbm_elem_counts = count_elements(df_wbm.formula).astype(int)
@@ -81,3 +83,57 @@
 # %%
 mp_fig.write_image(f"{module_dir}/figs/{today}-mp-elements.svg", width=1000, height=500)
 # save_fig(mp_fig, f"{FIGS}/{today}-mp-elements.svelte")
+
+
+# %% histogram of energy above MP convex hull for WBM
+col = "e_above_hull_mp2020_corrected_ppd_mp"
+# col = "e_form_per_atom_mp2020_corrected"
+mean, std = df_wbm[col].mean(), df_wbm[col].std()
+
+fig = df_wbm[col].hist(
+    bins=100,
+    backend="plotly",
+    range_x=[mean - 2 * std, mean + 2 * std],
+    template="plotly_dark",
+)
+
+if col.startswith("e_above_hull"):
+    n_stable = sum(df_wbm[col] <= 0)
+    n_unstable = sum(df_wbm[col] > 0)
+    assert n_stable + n_unstable == len(df_wbm.dropna())
+
+    dummy_mae = (df_wbm[col] - df_wbm[col].mean()).abs().mean()
+
+    title = (
+        f"n={len(df_wbm.dropna()):,} with {n_stable:,} stable + {n_unstable:,} "
+        f"unstable, dummy MAE={dummy_mae:.2f}"
+    )
+    fig.update_layout(title=dict(text=title, x=0.5, y=0.95))
+
+fig.update_layout(showlegend=False, paper_bgcolor="rgba(0,0,0,0)")
+fig.update_xaxes(title_text="WBM energy above MP convex hull (eV/atom)")
+
+
+for x_pos, label in zip(
+    [mean, mean + std, mean - std],
+    [f"{mean = :.2f}", f"{mean + std = :.2f}", f"{mean - std = :.2f}"],
+):
+    fig.add_vline(x=x_pos, line=dict(width=1, dash="dash"))
+    fig.add_annotation(
+        x=x_pos,
+        y=0.95,
+        text=label,
+        showarrow=False,
+        yref="paper",
+        xanchor="left",
+        xshift=5,
+    )
+fig.show()
+
+
+# subsample x
+for trace in fig.data:
+    trace.x = trace.x[::8]
+
+save_fig(fig, f"{FIGS}/{today}-wbm-each-hist.svelte")
+save_fig(fig, f"./figs/{today}-wbm-each-hist.svg", width=1000, height=500)
diff --git a/data/wbm/fetch_process_wbm_dataset.py b/data/wbm/fetch_process_wbm_dataset.py
@@ -526,7 +526,7 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
     xlabel="legacy corrections (eV / atom)",
     ylabel="MP2020 corrections (eV / atom)",
 )
-# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
+# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.webp")
 
 
 # %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to

diff --git a/data/wbm/readme.md b/data/wbm/readme.md
@@ -45,9 +45,9 @@ The number of materials in each step before and after processing are:
 | before | 61,848 | 52,800 | 79,205 | 40,328 | 23,308 | 257,487 |
 | after  | 61,466 | 52,755 | 79,160 | 40,314 | 23,268 | 256,963 |
 
-## 🔗 &thinsp; Links to raw WBM Data Files
+## 🔗 &thinsp; Links to WBM Files
 
-Links to WBM data files have proliferated. This is an attempt to keep track of them.
+Links to raw WBM data files have proliferated. This is an attempt to keep track of them.
 
 Initial structures (after element substitution but before DFT relaxation) were sent as Google Drive links via email by Hai-Chen Wang on 2021-09-01.
 
@@ -72,18 +72,30 @@ materialscloud:2021.68 includes a readme file with a description of the dataset,
 
 [wbm paper]: https://nature.com/articles/s41524-020-00481-6
 
-## 📊 &thinsp; Chemical Diversity
+## 🧪 &thinsp; Chemical Diversity
 
-Both the WBM test set and even more so the MP training set are heavily oxide dominated. The WBM test set is about 75% larger than the MP training set and also more chemically diverse, containing a higher fraction of transition metals, post-transition metals and metalloids. Our goal in picking such a large diverse test set is future-proofing. Ideally, this data will provide a challenging materials discovery test bed even for large foundational ML models in the future.
+The WBM test set and even more so the MP training set are heavily oxide dominated. The WBM test set is about 75% larger than the MP training set and also more chemically diverse, containing a higher fraction of transition metals, post-transition metals and metalloids. Our goal in picking such a large diverse test set is future-proofing. Ideally, this data will provide a challenging materials discovery test bed even for large foundational ML models in the future.
+
+Element counts for the WBM test set consisting of 256,963 `ComputedStructureEntries`
 
 <slot name="wbm-elements-heatmap">
   <img src="./figs/2023-01-08-wbm-elements.svg" alt="Periodic table log heatmap of WBM elements">
 </slot>
-<caption>Element counts for test set consisting of 256,963 WBM <code>ComputedStructureEntries</code></caption>
 
-By comparison, the training set of MP ComputedStructureEntries has this element distribution.
+Element counts for the MP training set consisting of 146,323 `ComputedStructureEntries`
 
 <slot name="mp-elements-heatmap">
   <img src="./figs/2023-01-08-mp-elements.svg" alt="Periodic table log heatmap of MP elements">
 </slot>
-<caption>Element counts for training set consisting of 146,323 MP <code>ComputedStructureEntries</code></caption>
+
+## 🎯 &thinsp; Target Distribution
+
+The energy above the MP convex hull of the WBM test set is distributed with a mean of **0.02 eV/atom** and a standard deviation of **0.25 eV/atom**.
+
+The dummy MAE of always predicting the test set mean is **0.17 eV/atom**.
+
+The number of stable materials is **97k** out of 257k, resulting in a dummy stability hit rate of **37%**.
+
+<slot name="wbm-each-hist">
+  <img src="./figs/2023-01-26-wbm-each-hist.svg" alt="WBM energy above MP convex hull distribution">
+</slot>
diff --git a/matbench_discovery/__init__.py b/matbench_discovery/__init__.py
@@ -5,9 +5,10 @@
 import sys
 from datetime import datetime
 
-ROOT = os.path.dirname(os.path.dirname(__file__))  # repository root
+ROOT = os.path.dirname(os.path.dirname(__file__))  # repo root
 FIGS = f"{ROOT}/site/src/figs"  # directory to store interactive figures
-STATIC = f"{ROOT}/site/static/figs"  # directory to store static figures
+STATIC = f"{ROOT}/site/static/figs"  # directory to store static figures, is symlinked
+# into site/src/routes/paper/figs dir
 MODELS = f"{ROOT}/site/src/routes/models"  # directory to write model analysis
 # whether a currently running slurm job is in debug mode
 DEBUG = "DEBUG" in os.environ or (

diff --git a/matbench_discovery/data.py b/matbench_discovery/data.py
@@ -181,7 +181,7 @@ def glob_to_df(
     return pd.concat(sub_dfs.values())
 
 
-def load_df_wbm_with_preds(
+def load_df_wbm_preds(
     models: Sequence[str],
     pbar: bool = True,
     id_col: str = "material_id",