Skip to content

Commit

Permalink
add 2023-01-25-rolling-mae-vs-hull-dist-models.svelte
Browse files Browse the repository at this point in the history
fix color of SEM shaded area and rolling window annotation in rolling_mae_vs_hull_dist() for backend=plotly
add site/src/figs/2023-01-26-wbm-each-hist.svelte to wbm/readme.md

implement rolling_acc for backend=plotly branch of hist_classified_stable_vs_hull_dist()
  • Loading branch information
janosh committed Jun 20, 2023
1 parent 2f795f7 commit 9fed210
Show file tree
Hide file tree
Showing 26 changed files with 344 additions and 155 deletions.
2 changes: 1 addition & 1 deletion data/mp/build_phase_diagram.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,4 @@
xlabel="MP Formation Energy (eV/atom)",
ylabel="Our Formation Energy (eV/atom)",
)
ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.png", dpi=300)
ax.figure.savefig(f"{ROOT}/tmp/{today}-our-vs-mp-formation-energies.webp", dpi=300)
4 changes: 2 additions & 2 deletions data/mp/get_mp_energies.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
annotate_mae_r2(df.formation_energy_per_atom, df.decomposition_enthalpy)
# result on 2023-01-10: plots match. no correlation between formation energy and decomposition
# enthalpy. R^2 = -1.571, MAE = 1.604
# ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.png", dpi=300)
# ax.figure.savefig(f"{module_dir}/{today}-mp-decomp-enth-vs-e-form.webp", dpi=300)


# %% scatter plot energy above convex hull vs decomposition enthalpy
Expand All @@ -99,4 +99,4 @@
title=f"{n_above_line:,} / {len(df):,} = {n_above_line/len(df):.1%} "
"MP materials with\nenergy_above_hull - decomposition_enthalpy.clip(0) > 0.1"
)
# ax.figure.savefig(f"{module_dir}/{today}-mp-e-above-hull-vs-decomp-enth.png", dpi=300)
# ax.figure.savefig(f"{module_dir}/{today}-mp-e-above-hull-vs-decomp-enth.webp", dpi=300)
60 changes: 58 additions & 2 deletions data/wbm/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@

from matbench_discovery import FIGS, today
from matbench_discovery.data import df_wbm

module_dir = os.path.dirname(__file__)
from matbench_discovery.plots import pio

"""
Compare MP and WBM elemental prevalence. Starting with WBM, MP below.
"""

module_dir = os.path.dirname(__file__)
print(f"{pio.templates.default=}")


# %%
wbm_elem_counts = count_elements(df_wbm.formula).astype(int)
Expand Down Expand Up @@ -81,3 +83,57 @@
# %%
mp_fig.write_image(f"{module_dir}/figs/{today}-mp-elements.svg", width=1000, height=500)
# save_fig(mp_fig, f"{FIGS}/{today}-mp-elements.svelte")


# %% histogram of energy above MP convex hull for WBM
# Plots the distribution of the selected WBM target column, marks the mean and
# mean +/- 1 std with dashed vertical lines, and saves the figure both as an
# interactive .svelte component and as a static .svg next to this module.
col = "e_above_hull_mp2020_corrected_ppd_mp"
# col = "e_form_per_atom_mp2020_corrected"
mean, std = df_wbm[col].mean(), df_wbm[col].std()

# restrict the visible x-range to mean +/- 2 std
fig = df_wbm[col].hist(
    bins=100,
    backend="plotly",
    range_x=[mean - 2 * std, mean + 2 * std],
    template="plotly_dark",
)

if col.startswith("e_above_hull"):
    # values <= 0 are counted as stable, values > 0 as unstable
    n_stable = int((df_wbm[col] <= 0).sum())
    n_unstable = int((df_wbm[col] > 0).sum())
    # NOTE(review): dropna() drops rows with a NaN in *any* column, not just
    # `col`, so this check also fails if another column has missing values.
    n_total = len(df_wbm.dropna())
    assert n_stable + n_unstable == n_total

    # MAE obtained by always predicting the column mean
    dummy_mae = (df_wbm[col] - mean).abs().mean()

    title = (
        f"n={n_total:,} with {n_stable:,} stable + {n_unstable:,} "
        f"unstable, dummy MAE={dummy_mae:.2f}"
    )
    fig.update_layout(title=dict(text=title, x=0.5, y=0.95))

fig.update_layout(showlegend=False, paper_bgcolor="rgba(0,0,0,0)")
fig.update_xaxes(title_text="WBM energy above MP convex hull (eV/atom)")


# dashed vertical marker plus text label at the mean and at mean +/- 1 std
for x_pos, label in zip(
    [mean, mean + std, mean - std],
    [f"{mean = :.2f}", f"{mean + std = :.2f}", f"{mean - std = :.2f}"],
):
    fig.add_vline(x=x_pos, line=dict(width=1, dash="dash"))
    fig.add_annotation(
        x=x_pos,
        y=0.95,  # in paper coordinates (yref="paper"), i.e. near the top edge
        text=label,
        showarrow=False,
        yref="paper",
        xanchor="left",
        xshift=5,
    )

fig.show()


# subsample x: keep only every 8th value of each trace before saving
# (presumably to shrink the exported figure -- TODO confirm)
for trace in fig.data:
    trace.x = trace.x[::8]

save_fig(fig, f"{FIGS}/{today}-wbm-each-hist.svelte")
# anchor the static export to this module's directory (like the element heatmap
# exports in this file) instead of the current working directory
save_fig(fig, f"{module_dir}/figs/{today}-wbm-each-hist.svg", width=1000, height=500)
2 changes: 1 addition & 1 deletion data/wbm/fetch_process_wbm_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
xlabel="legacy corrections (eV / atom)",
ylabel="MP2020 corrections (eV / atom)",
)
# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.png")
# ax.figure.savefig(f"{ROOT}/tmp/{today}-legacy-vs-mp2020-corrections.webp")


# %% Python crashes with segfault on correcting the energy of wbm-1-24459 due to
Expand Down
26 changes: 19 additions & 7 deletions data/wbm/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ The number of materials in each step before and after processing are:
| before | 61,848 | 52,800 | 79,205 | 40,328 | 23,308 | 257,487 |
| after | 61,466 | 52,755 | 79,160 | 40,314 | 23,268 | 256,963 |

## 🔗 &thinsp; Links to raw WBM Data Files
## 🔗 &thinsp; Links to WBM Files

Links to WBM data files have proliferated. This is an attempt to keep track of them.
Links to raw WBM data files have proliferated. This is an attempt to keep track of them.

Initial structures (after element substitution but before DFT relaxation) were sent as Google Drive links via email by Hai-Chen Wang on 2021-09-01.

Expand All @@ -72,18 +72,30 @@ materialscloud:2021.68 includes a readme file with a description of the dataset,

[wbm paper]: https://nature.com/articles/s41524-020-00481-6

## 📊 &thinsp; Chemical Diversity
## 🧪 &thinsp; Chemical Diversity

Both the WBM test set and even more so the MP training set are heavily oxide dominated. The WBM test set is about 75% larger than the MP training set and also more chemically diverse, containing a higher fraction of transition metals, post-transition metals and metalloids. Our goal in picking such a large diverse test set is future-proofing. Ideally, this data will provide a challenging materials discovery test bed even for large foundational ML models in the future.
Both the WBM test set and, even more so, the MP training set are heavily oxide-dominated. The WBM test set is about 75% larger than the MP training set and also more chemically diverse, containing a higher fraction of transition metals, post-transition metals and metalloids. Our goal in picking such a large, diverse test set is future-proofing. Ideally, this data will provide a challenging materials discovery test bed even for large foundational ML models in the future.

Element counts for WBM test set consisting of 256,963 WBM `ComputedStructureEntries`

<slot name="wbm-elements-heatmap">
<img src="./figs/2023-01-08-wbm-elements.svg" alt="Periodic table log heatmap of WBM elements">
</slot>
<caption>Element counts for test set consisting of 256,963 WBM <code>ComputedStructureEntries</code></caption>

By comparison, the training set of MP ComputedStructureEntries has this element distribution.
Element counts for MP training set consisting of 146,323 `ComputedStructureEntries`

<slot name="mp-elements-heatmap">
<img src="./figs/2023-01-08-mp-elements.svg" alt="Periodic table log heatmap of MP elements">
</slot>
<caption>Element counts for training set consisting of 146,323 MP <code>ComputedStructureEntries</code></caption>

## 🎯 &thinsp; Target Distribution

The distribution of energies above the MP convex hull for the WBM test set has a mean of **0.02 eV/atom** and a standard deviation of **0.25 eV/atom**.

The dummy MAE of always predicting the test set mean is **0.17 eV/atom**.

The number of stable materials is **97k** out of 257k, resulting in a dummy stability hit rate of **37%**.

<slot name="wbm-each-hist">
<img src="./figs/2023-01-26-wbm-each-hist.svg" alt="WBM energy above MP convex hull distribution">
</slot>
5 changes: 3 additions & 2 deletions matbench_discovery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@
import sys
from datetime import datetime

ROOT = os.path.dirname(os.path.dirname(__file__)) # repository root
ROOT = os.path.dirname(os.path.dirname(__file__)) # repo root
FIGS = f"{ROOT}/site/src/figs" # directory to store interactive figures
STATIC = f"{ROOT}/site/static/figs" # directory to store static figures
STATIC = f"{ROOT}/site/static/figs" # directory to store static figures, is symlinked
# into site/src/routes/paper/figs dir
MODELS = f"{ROOT}/site/src/routes/models" # directory to write model analysis
# whether a currently running slurm job is in debug mode
DEBUG = "DEBUG" in os.environ or (
Expand Down
2 changes: 1 addition & 1 deletion matbench_discovery/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def glob_to_df(
return pd.concat(sub_dfs.values())


def load_df_wbm_with_preds(
def load_df_wbm_preds(
models: Sequence[str],
pbar: bool = True,
id_col: str = "material_id",
Expand Down
Loading

0 comments on commit 9fed210

Please sign in to comment.