Skip to content

Commit

Permalink
Improve unphysical (greater than 1) occupancy handling in CifParser
Browse files Browse the repository at this point in the history
… and add missing site label `if not check_occu` (#3819)

* fix tol def and add error msg

* no verify

* allow any occupancy when not check_occu

* remove unnecessary warning

* simplify warning message

* add one more unit test

* fix var name in comment

* revise docstring

* fix missing label when `check_occu`

* update unit test

* revert accidental change during merge

* revert change to `occupancy_tolerance`

* fix comment and revert a unnecessary deletion

* filter valid occupancy

* revert docstring change
  • Loading branch information
DanielYang59 authored May 30, 2024
1 parent 0c9ff40 commit a772ddb
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 11 deletions.
20 changes: 11 additions & 9 deletions pymatgen/io/cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,9 +323,9 @@ def __init__(
"""
Args:
filename (PathLike): CIF file, gzipped or bzipped CIF files are fine too.
occupancy_tolerance (float): If total occupancy of a site is between 1 and occupancy_tolerance, the
occupancies will be scaled down to 1.
site_tolerance (float): This tolerance is used to determine if two sites are in the same position,
occupancy_tolerance (float): If total occupancy of a site is between
1 and occupancy_tolerance, it will be scaled down to 1.
site_tolerance (float): This tolerance is used to determine if two sites are at the same position,
in which case they will be combined to a single disordered site. Defaults to 1e-4.
frac_tolerance (float): This tolerance is used to determine is a coordinate should be rounded to an ideal
value. e.g. 0.6667 is rounded to 2/3. This is desired if symmetry operations are going to be applied.
Expand Down Expand Up @@ -1027,11 +1027,11 @@ def get_matching_coord(

# Get occupancy
try:
occu = str2float(data["_atom_site_occupancy"][idx])
occu: float = str2float(data["_atom_site_occupancy"][idx])
except (KeyError, ValueError):
occu = 1

# If check_occu is True or the occupancy is greater than 0, create comp_d
# If don't check_occu or the occupancy is greater than 0, create comp_dict
if not check_occu or occu > 0:
# Create site coordinate
coord: Vector3D = (
Expand Down Expand Up @@ -1073,7 +1073,7 @@ def get_matching_coord(

if any(occu > 1 for occu in _sum_occupancies):
msg = (
f"Some occupancies ({_sum_occupancies}) sum to > 1! If they are within "
f"Some occupancies ({filter(lambda x: x<=1, _sum_occupancies)}) sum to > 1! If they are within "

This comment has been minimized.

Copy link
@janosh

janosh Jun 1, 2024

Member

@shyuep did you actually look at the code in this PR? this should not have been merged as is. x<=1 should be x>1 and filter returns a generator which prints like this

f"{(i for i in range(3))}"
>>> '<generator object <genexpr> at 0x110ffe670>'

it's good to ask for test cases for error messages and warnings

cc @DanielYang59

This comment has been minimized.

Copy link
@DanielYang59

DanielYang59 Jun 1, 2024

Author Contributor

Oh. I apologies for this, and I would push a PR to fix this ASAP

"the occupancy_tolerance, they will be rescaled. "
f"The current occupancy_tolerance is set to: {self._occupancy_tolerance}"
)
Expand Down Expand Up @@ -1149,7 +1149,10 @@ def get_matching_coord(
all_species_noedit = all_species.copy() # save copy before scaling in case of check_occu=False, used below
for idx, species in enumerate(all_species):
total_occu = sum(species.values())
if 1 < total_occu <= self._occupancy_tolerance:
if check_occu and total_occu > self._occupancy_tolerance:
raise ValueError(f"Occupancy {total_occu} exceeded tolerance.")

if total_occu > 1:
all_species[idx] = species / total_occu

if all_species and len(all_species) == len(all_coords) and len(all_species) == len(all_magmoms):
Expand Down Expand Up @@ -1198,6 +1201,7 @@ def get_matching_coord(
all_coords[idx],
lattice,
properties=site_properties,
label=all_labels[idx],
skip_checks=True,
)

Expand Down Expand Up @@ -1278,8 +1282,6 @@ def parse_structures(
"in the CIF file as is. If you want the primitive cell, please set primitive=True explicitly.",
UserWarning,
)
if not check_occu: # added in https://github.com/materialsproject/pymatgen/pull/2836
warnings.warn("Structures with unphysical site occupancies are not compatible with many pymatgen features.")

if primitive and symmetrized:
raise ValueError(
Expand Down
15 changes: 13 additions & 2 deletions tests/io/test_cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,17 +731,28 @@ def test_empty(self):
cb2 = CifBlock.from_str(str(cb))
assert cb == cb2

def test_bad_cif(self):
def test_bad_occu(self):
filepath = f"{TEST_FILES_DIR}/cif/bad_occu.cif"
parser = CifParser(filepath)
with pytest.raises(
ValueError, match="No structure parsed for section 1 in CIF.\nSpecies occupancies sum to more than 1!"
ValueError, match="No structure parsed for section 1 in CIF.\nOccupancy 1.556 exceeded tolerance."
):
parser.parse_structures(on_error="raise")
parser = CifParser(filepath, occupancy_tolerance=2)
struct = parser.parse_structures()[0]
assert struct[0].species["Al3+"] == approx(0.778)

def test_not_check_occu(self):
# Test large occupancy with check_occu turned off
with open(f"{TEST_FILES_DIR}/cif/site_type_symbol_test.cif") as cif_file:
cif_str = cif_file.read()
cif_str = cif_str.replace("Te Te 1.0000", "Te_label Te 10.0", 1)

structs = CifParser.from_str(cif_str).parse_structures(check_occu=False)

assert len(structs) > 0
assert set(structs[0].labels) == {"Te_label", "Ge"}

def test_one_line_symm(self):
cif_file = f"{TEST_FILES_DIR}/cif/OneLineSymmP1.cif"
parser = CifParser(cif_file)
Expand Down

0 comments on commit a772ddb

Please sign in to comment.