Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SOC & multiple PROCAR parsing functionalities #3890

Merged
merged 16 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
290 changes: 261 additions & 29 deletions src/pymatgen/io/vasp/outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from monty.os.path import zpath
from monty.re import regrep
from numpy.testing import assert_allclose
from tqdm import tqdm

from pymatgen.core import Composition, Element, Lattice, Structure
from pymatgen.core.trajectory import Trajectory
Expand Down Expand Up @@ -3781,10 +3782,15 @@ def get_alpha(self) -> VolumetricData:
return VolumetricData(self.structure, alpha_data)


class Procar:
class Procar(MSONable):
"""
PROCAR file reader.

Updated to use code from easyunfold (https://smtg-bham.github.io/easyunfold; band-structure
unfolding package) to allow SOC PROCAR parsing, and parsing multiple PROCAR files together.
easyunfold's PROCAR parser can be used if finer control over projections (k-point weighting,
normalisation per band, quick orbital sub-selection etc) is needed.

Attributes:
data (dict): The PROCAR data of the form below. It should VASP uses 1-based indexing,
but all indices are converted to 0-based here.
Expand All @@ -3795,59 +3801,259 @@ class Procar:
nbands (int): Number of bands.
nkpoints (int): Number of k-points.
nions (int): Number of ions.
nspins (int): Number of spins.
is_soc (bool): Whether the PROCAR contains spin-orbit coupling (LSORBIT = True) data.
kpoints (np.array): The k-points as an nd.array of shape (nkpoints, 3).
occupancies (dict): The occupancies of the bands as a dict of the form:
{ spin: nd.array accessed with (k-point index, band index) }
eigenvalues (dict): The eigenvalues of the bands as a dict of the form:
{ spin: nd.array accessed with (k-point index, band index) }
xyz_data (dict): The PROCAR projections data along the x,y and z magnetisation projection
directions, with is_soc = True (see VASP wiki for more info).
{ 'x'/'y'/'z': nd.array accessed with (k-point index, band index, ion index, orbital index) }
"""

def __init__(self, filename: PathLike) -> None:
def __init__(self, filename: PathLike | list[PathLike]):
"""
Args:
filename: The PROCAR to read.
filename: The path to PROCAR(.gz) file to read, or list of paths.
"""
headers = None
# get PROCAR filenames list to parse:
filenames = filename if isinstance(filename, list) else [filename]
self.nions: int | None = None # used to check for consistency in files later
self.nspins: int | None = None # used to check for consistency in files later
self.is_soc: bool | None = None # used to check for consistency in files later
self.orbitals = None # used to check for consistency in files later
self.read(filenames)

def read(self, filenames: list[PathLike]):
"""
Read in PROCAR projections data, possibly from multiple files.

Args:
filenames: List of PROCAR files to read.
"""
parsed_kpoints = None
occupancies_list, kpoints_list, weights_list = [], [], []
eigenvalues_list, data_list, xyz_data_list = [], [], []
phase_factors_list = []
for filename in tqdm(filenames, desc="Reading PROCARs", unit="file", disable=len(filenames) == 1):
kpoints, weights, eigenvalues, occupancies, data, phase_factors, xyz_data = self._read(
filename, parsed_kpoints=parsed_kpoints
)

# Append to respective lists
occupancies_list.append(occupancies)
kpoints_list.append(kpoints)
weights_list.append(weights)
eigenvalues_list.append(eigenvalues)
data_list.append(data)
xyz_data_list.append(xyz_data)
phase_factors_list.append(phase_factors)

# Combine arrays along the kpoints axis:
# nbands (axis = 2) could differ between arrays, so set missing values to zero:
max_nbands = max(eig_dict[Spin.up].shape[1] for eig_dict in eigenvalues_list)
for dict_array in itertools.chain(
occupancies_list, eigenvalues_list, data_list, xyz_data_list, phase_factors_list
):
if dict_array:
for key, array in dict_array.items():
if array.shape[1] < max_nbands:
if len(array.shape) == 2: # occupancies, eigenvalues
dict_array[key] = np.pad(
array,
((0, 0), (0, max_nbands - array.shape[2])),
mode="constant",
)
elif len(array.shape) == 4: # data, phase_factors
dict_array[key] = np.pad(
array,
(
(0, 0),
(0, max_nbands - array.shape[2]),
(0, 0),
(0, 0),
),
mode="constant",
)
elif len(array.shape) == 5: # xyz_data
dict_array[key] = np.pad(
array,
(
(0, 0),
(0, max_nbands - array.shape[2]),
(0, 0),
(0, 0),
(0, 0),
),
mode="constant",
)
else:
raise ValueError("Unexpected array shape encountered!")

# set nbands, nkpoints, and other attributes:
self.nbands = max_nbands
self.kpoints = np.concatenate(kpoints_list, axis=0)
self.nkpoints = len(self.kpoints)
self.occupancies = {
spin: np.concatenate([occupancies[spin] for occupancies in occupancies_list], axis=0)
for spin in occupancies_list[0]
}
self.eigenvalues = {
spin: np.concatenate([eigenvalues[spin] for eigenvalues in eigenvalues_list], axis=0)
for spin in eigenvalues_list[0]
}
self.weights = np.concatenate(weights_list, axis=0)
self.data = {spin: np.concatenate([data[spin] for data in data_list], axis=0) for spin in data_list[0]}
self.phase_factors = {
spin: np.concatenate([phase_factors[spin] for phase_factors in phase_factors_list], axis=0)
for spin in phase_factors_list[0]
}
if self.is_soc:
self.xyz_data: dict | None = {
key: np.concatenate([xyz_data[key] for xyz_data in xyz_data_list], axis=0) for key in xyz_data_list[0]
}
else:
self.xyz_data = None

def _parse_kpoint_line(self, line):
"""
Parse k-point vector from a PROCAR line.

Sometimes VASP outputs the kpoints joined together like
'0.00000000-0.50000000-0.50000000' when there are negative signs,
so need to be able to recognise and handle this.
"""
fields = line.split()
kpoint_fields = fields[3 : fields.index("weight")]
kpoint_fields = [" -".join(field.split("-")).split() for field in kpoint_fields]
kpoint_fields = [val for sublist in kpoint_fields for val in sublist] # flatten

return tuple(round(float(val), 5) for val in kpoint_fields) # tuple to make it hashable,
# rounded to 5 decimal places to ensure proper kpoint matching

def _read(self, filename: PathLike, parsed_kpoints=None):
"""
Main function for reading in the PROCAR projections data.

Args:
filename: Path to PROCAR file to read.
parsed_kpoints:
set of tuples of already-parsed kpoints (e.g. from multiple zero-weighted bandstructure
calculations), to ensure redundant/duplicate parsing.
"""
if parsed_kpoints is None:
parsed_kpoints = set()

with zopen(filename, mode="rt") as file_handle:
preamble_expr = re.compile(r"# of k-points:\s*(\d+)\s+# of bands:\s*(\d+)\s+# of ions:\s*(\d+)")
kpoint_expr = re.compile(r"^k-point\s+(\d+).*weight = ([0-9\.]+)")
band_expr = re.compile(r"^band\s+(\d+)")
ion_expr = re.compile(r"^ion.*")
total_expr = re.compile(r"^tot.*")
expr = re.compile(r"^([0-9]+)\s+")
current_kpoint = 0
current_band = 0
done = False
spin = Spin.down
spin = Spin.down # switched to Spin.up for first block

n_kpoints = None
kpoints: list[tuple[float, float, float]] = []
n_bands = None
n_ions = None
weights: list[float] = []
weights: np.ndarray[float] | None = None
headers = None
data: dict[Spin, np.ndarray] | None = None
data: dict[Spin, np.ndarray] = {}
eigenvalues: dict[Spin, np.ndarray] | None = None
occupancies: dict[Spin, np.ndarray] | None = None
phase_factors: dict[Spin, np.ndarray] | None = None
xyz_data: dict[str, np.ndarray] | None = None # 'x'/'y'/'z' as keys for SOC projections dict
# keep track of parsed kpoints, to avoid redundant/duplicate parsing with multiple PROCARs:
this_procar_parsed_kpoints = (
set()
) # set of tuples of parsed (kvectors, 0/1 for Spin.up/down) for this PROCAR

# first dynamically determine whether PROCAR is SOC or not; SOC PROCARs have 4 lists of projections (
# total and x,y,z) for each band, while non-SOC have only 1 list of projections:
tot_count = 0
band_count = 0
for line in file_handle:
if total_expr.match(line):
tot_count += 1
elif band_expr.match(line):
band_count += 1
if band_count == 2:
break

file_handle.seek(0) # reset file handle to beginning
if tot_count == 1:
is_soc = False
elif tot_count == 4:
is_soc = True
else:
raise ValueError(
"Number of lines starting with 'tot' in PROCAR does not match expected values (4x or 1x number of "
"lines with 'band'), indicating a corrupted file!"
)
if self.is_soc is not None and self.is_soc != is_soc:
raise ValueError("Mismatch in SOC setting (LSORBIT) in supplied PROCARs!")
self.is_soc = is_soc

skipping_kpoint = False # true when skipping projections for a previously-parsed kpoint
ion_line_count = 0 # printed twice when phase factors present
proj_data_parsed_for_band = 0 # 0 for non-SOC, 1-4 for SOC/phase factors
for line in file_handle:
line = line.strip()
if band_expr.match(line):
match = band_expr.match(line)
current_band = int(match[1]) - 1 # type: ignore[index]
done = False
if ion_expr.match(line):
ion_line_count += 1

elif kpoint_expr.match(line):
if kpoint_expr.match(line):
kvec = self._parse_kpoint_line(line)
match = kpoint_expr.match(line)
current_kpoint = int(match[1]) - 1 # type: ignore[index]
weights[current_kpoint] = float(match[2]) # type: ignore[index]
if current_kpoint == 0:
spin = Spin.up if spin == Spin.down else Spin.down
done = False

if (
kvec not in parsed_kpoints
and (kvec, {Spin.down: 0, Spin.up: 1}[spin]) not in this_procar_parsed_kpoints
):
this_procar_parsed_kpoints.add((kvec, {Spin.down: 0, Spin.up: 1}[spin]))
skipping_kpoint = False
if spin == Spin.up:
kpoints.append(kvec) # only add once
else: # skip ahead to next kpoint:
skipping_kpoint = True
continue

if spin == Spin.up: # record k-weight only once
weights[current_kpoint] = float(match[2]) # type: ignore[index]
proj_data_parsed_for_band = 0

elif skipping_kpoint:
continue

elif band_expr.match(line):
ion_line_count = 0 # printed a second time when phase factors present
match = band_expr.match(line)
current_band = int(match[1]) - 1 # type: ignore[index]
tokens = line.split()
eigenvalues[spin][current_kpoint, current_band] = float(tokens[4]) # type: ignore[index]
occupancies[spin][current_kpoint, current_band] = float(tokens[-1]) # type: ignore[index]
# keep track of parsed projections for each band (1x w/non-SOC, 4x w/SOC):
proj_data_parsed_for_band = 0

elif headers is None and ion_expr.match(line):
headers = line.split()
headers.pop(0)
headers.pop(-1)

data = defaultdict(lambda: np.zeros((n_kpoints, n_bands, n_ions, len(headers))))

phase_factors = defaultdict(
lambda: np.full((n_kpoints, n_bands, n_ions, len(headers)), np.nan, dtype=np.complex128)
)
if self.is_soc: # dict keys are now "x", "y", "z" rather than Spin.up/down
xyz_data = defaultdict(lambda: np.zeros((n_kpoints, n_bands, n_ions, len(headers))))

elif expr.match(line):
tokens = line.split()
Expand All @@ -3856,11 +4062,15 @@ def __init__(self, filename: PathLike) -> None:
num_data = np.array([float(t) for t in tokens[: len(headers)]])
assert phase_factors is not None

if not done:
assert data is not None
if proj_data_parsed_for_band == 0:
data[spin][current_kpoint, current_band, index, :] = num_data

elif len(tokens) > len(headers):
elif self.is_soc and proj_data_parsed_for_band < 4:
proj_direction = {1: "x", 2: "y", 3: "z"}[proj_data_parsed_for_band]
assert xyz_data is not None
xyz_data[proj_direction][current_kpoint, current_band, index, :] = num_data

elif len(tokens) > len(headers): # note no xyz projected phase factors with SOC
# New format of PROCAR (VASP 5.4.4)
num_data = np.array([float(t) for t in tokens[: 2 * len(headers)]])
for orb in range(len(headers)):
Expand All @@ -3873,24 +4083,45 @@ def __init__(self, filename: PathLike) -> None:
else:
phase_factors[spin][current_kpoint, current_band, index, :] += 1j * num_data

elif line.startswith("tot"):
done = True
elif total_expr.match(line):
proj_data_parsed_for_band += 1

elif preamble_expr.match(line):
match = preamble_expr.match(line)
assert match is not None
n_kpoints = int(match[1])
n_bands = int(match[2])
if eigenvalues is None: # first spin
weights = np.zeros(n_kpoints)
eigenvalues = defaultdict(lambda: np.zeros((n_kpoints, n_bands)))
occupancies = defaultdict(lambda: np.zeros((n_kpoints, n_bands)))
n_ions = int(match[3])
weights = np.zeros(n_kpoints)

self.nkpoints = n_kpoints
self.nbands = n_bands
self.nions = n_ions
self.weights = weights
if self.nions is not None and self.nions != n_ions: # parsing multiple PROCARs but nions mismatch!
raise ValueError(f"Mismatch in number of ions in supplied PROCARs: ({n_ions} vs {self.nions})!")

self.nions = n_ions # attributes that should be consistent between multiple files are set here
if self.orbitals is not None and self.orbitals != headers: # multiple PROCARs but orbitals mismatch!
raise ValueError(f"Mismatch in orbital headers in supplied PROCARs: {headers} vs {self.orbitals}!")
self.orbitals = headers
self.data = data
self.phase_factors = phase_factors
if self.nspins is not None and self.nspins != len(data): # parsing multiple PROCARs but nspins mismatch!
raise ValueError("Mismatch in number of spin channels in supplied PROCARs!")
self.nspins = len(data)

# chop off empty kpoints in arrays and redetermine nkpoints as we may have skipped previously-parsed kpoints
nkpoints = current_kpoint + 1
weights = np.array(weights[:nkpoints]) # type: ignore[index]
data = {spin: data[spin][:nkpoints] for spin in data} # type: ignore[index]
eigenvalues = {spin: eigenvalues[spin][:nkpoints] for spin in eigenvalues} # type: ignore[union-attr,index]
occupancies = {spin: occupancies[spin][:nkpoints] for spin in occupancies} # type: ignore[union-attr,index]
phase_factors = {spin: phase_factors[spin][:nkpoints] for spin in phase_factors} # type: ignore[union-attr,index]
if self.is_soc:
xyz_data = {spin: xyz_data[spin][:nkpoints] for spin in xyz_data} # type: ignore[union-attr,index]

# Update the parsed kpoints
parsed_kpoints.update({kvec_spin_tuple[0] for kvec_spin_tuple in this_procar_parsed_kpoints})

return kpoints, weights, eigenvalues, occupancies, data, phase_factors, xyz_data

def get_projection_on_elements(self, structure: Structure) -> dict[Spin, list[list[dict[str, float]]]]:
"""Get a dict of projections on elements.
Expand Down Expand Up @@ -3919,7 +4150,8 @@ def get_projection_on_elements(self, structure: Structure) -> dict[Spin, list[li
return elem_proj

def get_occupation(self, atom_index: int, orbital: str) -> dict:
"""Get the occupation for a particular orbital of a particular atom.
"""
Get the occupation for a particular orbital of a particular atom.

Args:
atom_num (int): Index of atom in the PROCAR. It should be noted
Expand Down
Binary file added tests/files/io/vasp/outputs/PROCAR.SOC.gz
Binary file not shown.
Binary file added tests/files/io/vasp/outputs/PROCAR.split1.gz
Binary file not shown.
Binary file added tests/files/io/vasp/outputs/PROCAR.split2.gz
Binary file not shown.
Loading