Merge pull request #2562 from sudarshanv01/qchem_io_eigenvals

Parsing the Fock matrix and eigenvalues from the QChem output file
materialsproject · Nov 1, 2022 · 03691a8 · 03691a8
2 parents 553dc23 + 15807db
commit 03691a8
Show file tree

Hide file tree

Showing 7 changed files with 19,657 additions and 6 deletions.
diff --git a/pymatgen/io/qchem/outputs.py b/pymatgen/io/qchem/outputs.py
@@ -26,16 +26,16 @@
 from pymatgen.core import Molecule
 from pymatgen.io.qchem.utils import (
     process_parsed_coords,
+    process_parsed_fock_matrix,
+    read_matrix_pattern,
     read_pattern,
     read_table_pattern,
 )
 
 try:
-
-    have_babel = True
+    from openbabel import openbabel
 except ImportError:
     openbabel = None
-    have_babel = False
 
 
 __author__ = "Samuel Blau, Brandon Wood, Shyam Dwaraknath, Evan Spotte-Smith"
@@ -111,6 +111,17 @@ def __init__(self, filename: str):
             terminate_on_match=True,
         ).get("key")
 
+        # Get the value of scf_final_print in the output file
+        scf_final_print = read_pattern(
+            self.text,
+            {"key": r"scf_final_print\s*=\s*(\d+)"},
+            terminate_on_match=True,
+        ).get("key")
+        if scf_final_print is not None:
+            self.data["scf_final_print"] = int(scf_final_print[0][0])
+        else:
+            self.data["scf_final_print"] = 0
+
         # Check if calculation uses GEN_SCFMAN, multiple potential output formats
         self.data["using_GEN_SCFMAN"] = read_pattern(
             self.text,
@@ -337,6 +348,15 @@ def __init__(self, filename: str):
         if self.data.get("force_job", []):
             self._read_force_data()
 
+        # Read in the eigenvalues from the output file
+        if self.data["scf_final_print"] >= 1:
+            self._read_eigenvalues()
+
+        # Read the Fock matrix from the output file
+        if self.data["scf_final_print"] >= 3:
+            self._read_fock_matrix()
+            self._read_coefficient_matrix()
+
         # Check if the calculation is a PES scan. If so, parse the relevant output
         self.data["scan_job"] = read_pattern(
             self.text, {"key": r"(?i)\s*job(?:_)*type\s*(?:=)*\s*pes_scan"}, terminate_on_match=True
@@ -379,6 +399,120 @@ def multiple_outputs_from_file(cls, filename, keep_sub_files=True):
                 os.remove(filename + "." + str(i))
         return to_return
 
+    def _read_eigenvalues(self):
+        """Parse the orbital energies from the output file. An array of the
+        dimensions of the number of orbitals used in the calculation is stored."""
+
+        # Find the pattern corresponding to the "Final Alpha MO Eigenvalues" section
+        header_pattern = r"Final Alpha MO Eigenvalues"
+        # The elements of the matrix are always floats, they are surrounded by
+        # the index of the matrix, which are integers.
+        elements_pattern = r"\-*\d+\.\d+"
+        if not self.data.get("unrestricted", []):
+            spin_unrestricted = False
+            # Since this is a spin-paired calculation, there will be
+            # only a single list of orbital energies (denoted as alpha)
+            # in the input file. The footer will then correspond
+            # to the next section, which is the MO coefficients.
+            footer_pattern = r"Final Alpha MO Coefficients+\s*"
+        else:
+            spin_unrestricted = True
+            # It is an unrestricted calculation, so there will be two sets
+            # of eigenvalues. First parse the alpha eigenvalues.
+            footer_pattern = r"Final Beta MO Eigenvalues"
+        # Common for both spin restricted and unrestricted calculations is the alpha eigenvalues.
+        alpha_eigenvalues = read_matrix_pattern(
+            header_pattern, footer_pattern, elements_pattern, self.text, postprocess=float
+        )
+        # The beta eigenvalues are only present if this is a spin-unrestricted calculation.
+        if spin_unrestricted:
+            header_pattern = r"Final Beta MO Eigenvalues"
+            footer_pattern = r"Final Alpha MO Coefficients+\s*"
+            beta_eigenvalues = read_matrix_pattern(
+                header_pattern, footer_pattern, elements_pattern, self.text, postprocess=float
+            )
+
+        self.data["alpha_eigenvalues"] = alpha_eigenvalues
+
+        if spin_unrestricted:
+            self.data["beta_eigenvalues"] = beta_eigenvalues
+
+    def _read_fock_matrix(self):
+        """Parses the Fock matrix. The matrix is read in whole
+        from the output file and then transformed into the right dimensions."""
+
+        # The header is the same for both spin-restricted and spin-unrestricted calculations.
+        header_pattern = r"Final Alpha Fock Matrix"
+        # The elements of the matrix are always floats, they are surrounded by
+        # the index of the matrix, which are integers
+        elements_pattern = r"\-*\d+\.\d+"
+        if not self.data.get("unrestricted", []):
+            spin_unrestricted = False
+            footer_pattern = "SCF time:"
+        else:
+            spin_unrestricted = True
+            footer_pattern = "Final Beta Fock Matrix"
+        # Common for both spin restricted and unrestricted calculations is the alpha Fock matrix.
+        alpha_fock_matrix = read_matrix_pattern(
+            header_pattern, footer_pattern, elements_pattern, self.text, postprocess=float
+        )
+        # The beta Fock matrix is only present if this is a spin-unrestricted calculation.
+        if spin_unrestricted:
+            header_pattern = r"Final Beta Fock Matrix"
+            footer_pattern = "SCF time:"
+            beta_fock_matrix = read_matrix_pattern(
+                header_pattern, footer_pattern, elements_pattern, self.text, postprocess=float
+            )
+
+        # Convert the matrices to the right dimension. Right now they are simply
+        # one massive list of numbers, but we need to split them into a matrix. The
+        # Fock matrix must always be a square matrix and as a result, we have to
+        # modify the dimensions.
+        alpha_fock_matrix = process_parsed_fock_matrix(alpha_fock_matrix)
+        self.data["alpha_fock_matrix"] = alpha_fock_matrix
+
+        if spin_unrestricted:
+            # Perform the same transformation for the beta Fock matrix.
+            beta_fock_matrix = process_parsed_fock_matrix(beta_fock_matrix)
+            self.data["beta_fock_matrix"] = beta_fock_matrix
+
+    def _read_coefficient_matrix(self):
+        """Parses the coefficient matrix from the output file. Done is much
+        the same was as the Fock matrix."""
+        # The header is the same for both spin-restricted and spin-unrestricted calculations.
+        header_pattern = r"Final Alpha MO Coefficients"
+        # The elements of the matrix are always floats, they are surrounded by
+        # the index of the matrix, which are integers
+        elements_pattern = r"\-*\d+\.\d+"
+        if not self.data.get("unrestricted", []):
+            spin_unrestricted = False
+            footer_pattern = "Final Alpha Density Matrix"
+        else:
+            spin_unrestricted = True
+            footer_pattern = "Final Beta MO Coefficients"
+        # Common for both spin restricted and unrestricted calculations is the alpha Fock matrix.
+        alpha_coeff_matrix = read_matrix_pattern(
+            header_pattern, footer_pattern, elements_pattern, self.text, postprocess=float
+        )
+        if spin_unrestricted:
+            header_pattern = r"Final Beta MO Coefficients"
+            footer_pattern = "Final Alpha Density Matrix"
+            beta_coeff_matrix = read_matrix_pattern(
+                header_pattern, footer_pattern, elements_pattern, self.text, postprocess=float
+            )
+
+        # Convert the matrices to the right dimension. Right now they are simply
+        # one massive list of numbers, but we need to split them into a matrix. The
+        # Fock matrix must always be a square matrix and as a result, we have to
+        # modify the dimensions.
+        alpha_coeff_matrix = process_parsed_fock_matrix(alpha_coeff_matrix)
+        self.data["alpha_coeff_matrix"] = alpha_coeff_matrix
+
+        if spin_unrestricted:
+            # Perform the same transformation for the beta Fock matrix.
+            beta_coeff_matrix = process_parsed_fock_matrix(beta_coeff_matrix)
+            self.data["beta_coeff_matrix"] = beta_coeff_matrix
+
     def _read_charge_and_multiplicity(self):
         """
         Parses charge and multiplicity.
@@ -893,7 +1027,7 @@ def _read_optimization_data(self):
                 real_energy_trajectory[ii] = float(entry[0])
             self.data["energy_trajectory"] = real_energy_trajectory
             self._read_geometries()
-            if have_babel:
+            if openbabel is not None:
                 self.data["structure_change"] = check_for_structure_changes(
                     self.data["initial_molecule"],
                     self.data["molecule_from_last_geometry"],
@@ -1144,7 +1278,7 @@ def _read_scan_data(self):
             self.data["energy_trajectory"] = real_energy_trajectory
 
         self._read_geometries()
-        if have_babel:
+        if openbabel is not None:
             self.data["structure_change"] = check_for_structure_changes(
                 self.data["initial_molecule"],
                 self.data["molecule_from_last_geometry"],

diff --git a/pymatgen/io/qchem/sets.py b/pymatgen/io/qchem/sets.py
@@ -48,6 +48,7 @@ def __init__(
         new_geom_opt: dict | None = None,
         overwrite_inputs: dict | None = None,
         vdw_mode: Literal["atomic", "sequential"] = "atomic",
+        extra_scf_print: bool = False,
     ):
         """
         Args:
@@ -124,6 +125,9 @@ def __init__(
                 In 'atomic' mode (default), dict keys represent the atomic number associated with each
                 radius (e.g., '12' = carbon). In 'sequential' mode, dict keys represent the sequential
                 position of a single specific atom in the input structure.
+            extra_scf_print (bool): Whether to store extra information generated from the SCF
+                cycle. If switched on, the Fock Matrix, coefficients of MO and the density matrix
+                will be stored.
         """
         self.molecule = molecule
         self.job_type = job_type
@@ -142,6 +146,7 @@ def __init__(
         self.new_geom_opt = new_geom_opt
         self.overwrite_inputs = overwrite_inputs
         self.vdw_mode = vdw_mode
+        self.extra_scf_print = extra_scf_print
 
         pcm_defaults = {
             "heavypoints": "194",
@@ -303,6 +308,17 @@ def __init__(
                     for k, v in temp_opts.items():
                         myopt[k] = v
 
+        if extra_scf_print:
+            # Allow for the printing of the Fock matrix and the eigenvales
+            myrem["scf_final_print"] = "3"
+            # If extra_scf_print is specified, make sure that the convergence of the
+            # SCF cycle is at least 1e-8. Anything less than that might not be appropriate
+            # for printing out the Fock Matrix and coefficients of the MO.
+            if "scf_convergence" not in myrem:
+                myrem["scf_convergence"] = "8"
+            elif int(myrem["scf_convergence"]) < 8:
+                myrem["scf_convergence"] = "8"
+
         super().__init__(
             self.molecule,
             rem=myrem,
@@ -348,6 +364,7 @@ def __init__(
         nbo_params: dict | None = None,
         overwrite_inputs: dict | None = None,
         vdw_mode: Literal["atomic", "sequential"] = "atomic",
+        extra_scf_print: bool = False,
     ):
         """
         Args:
@@ -403,6 +420,9 @@ def __init__(
                 In 'atomic' mode (default), dict keys represent the atomic number associated with each
                 radius (e.g., '12' = carbon). In 'sequential' mode, dict keys represent the sequential
                 position of a single specific atom in the input structure.
+            extra_scf_print (bool): Whether to store extra information generated from the SCF
+                cycle. If switched on, the Fock Matrix, coefficients of MO and the density matrix
+                will be stored.
         """
         self.basis_set = basis_set
         self.scf_algorithm = scf_algorithm
@@ -421,6 +441,7 @@ def __init__(
             nbo_params=nbo_params,
             overwrite_inputs=overwrite_inputs,
             vdw_mode=vdw_mode,
+            extra_scf_print=extra_scf_print,
         )
 
 

diff --git a/pymatgen/io/qchem/tests/single_job.json b/pymatgen/io/qchem/tests/single_job.json
diff --git a/pymatgen/io/qchem/tests/test_outputs.py b/pymatgen/io/qchem/tests/test_outputs.py
@@ -98,6 +98,12 @@
     "ccsd_total_energy",
     "ccsd(t)_correlation_energy",
     "ccsd(t)_total_energy",
+    "alpha_fock_matrix",
+    "beta_fock_matrix",
+    "alpha_eigenvalues",
+    "beta_eigenvalues",
+    "alpha_coeff_matrix",
+    "beta_coeff_matrix",
 }
 
 if have_babel:
@@ -165,6 +171,7 @@
     "new_qchem_files/ts.out",
     "new_qchem_files/ccsd.qout",
     "new_qchem_files/ccsdt.qout",
+    "extra_scf_print.qcout",
 }
 
 multi_job_out_names = {

diff --git a/pymatgen/io/qchem/tests/test_sets.py b/pymatgen/io/qchem/tests/test_sets.py
@@ -324,6 +324,32 @@ def test_init(self):
         self.assertEqual(test_SPSet.solvent, {})
         self.assertEqual(test_SPSet.molecule, test_molecule)
 
+    def test_scf_extra_print(self):
+        test_molecule = QCInput.from_file(os.path.join(test_dir, "new_qchem_files/pcm.qin")).molecule
+        test_SPSet = SinglePointSet(molecule=test_molecule, extra_scf_print=True)
+        self.assertEqual(
+            test_SPSet.rem,
+            {
+                "job_type": "sp",
+                "gen_scfman": "true",
+                "basis": "def2-tzvppd",
+                "max_scf_cycles": "100",
+                "method": "wb97xd",
+                "scf_algorithm": "diis",
+                "xc_grid": "3",
+                "thresh": "14",
+                "s2thresh": "16",
+                "symmetry": "false",
+                "sym_ignore": "true",
+                "resp_charges": "true",
+                "scf_convergence": "8",
+                "scf_final_print": "3",
+            },
+        )
+        self.assertEqual(test_SPSet.pcm, {})
+        self.assertEqual(test_SPSet.solvent, {})
+        self.assertEqual(test_SPSet.molecule, test_molecule)
+
     def test_pcm_init(self):
         test_molecule = QCInput.from_file(os.path.join(test_dir, "new_qchem_files/pcm.qin")).molecule
         test_SPSet = SinglePointSet(molecule=test_molecule, pcm_dielectric=10.0)

diff --git a/pymatgen/io/qchem/utils.py b/pymatgen/io/qchem/utils.py
@@ -39,6 +39,25 @@ def read_pattern(text_str, patterns, terminate_on_match=False, postprocess=str):
     return matches
 
 
+def read_matrix_pattern(header_pattern, footer_pattern, elements_pattern, text, postprocess=str):
+    """Parse a matrix to get the quantities in a numpy array."""
+
+    # Get the piece of text between the header and the footer
+    header_regex = re.compile(header_pattern)
+    footer_regex = re.compile(footer_pattern)
+
+    # Find the text between the header and footer
+    text_between_header_and_footer = text[header_regex.search(text).end() : footer_regex.search(text).start()]
+
+    # Get the elements
+    elements = re.findall(elements_pattern, text_between_header_and_footer)
+
+    # Apply postprocessing to all the elements
+    elements = [postprocess(e) for e in elements]
+
+    return elements
+
+
 def read_table_pattern(
     text_str,
     header_pattern,
@@ -153,3 +172,32 @@ def process_parsed_coords(coords):
         for jj in range(3):
             geometry[ii, jj] = float(entry[jj])
     return geometry
+
+
+def process_parsed_fock_matrix(fock_matrix):
+    """The Fock matrix is parsed as a list, while it should actually be
+    a square matrix, this function takes the list of finds the right dimensions
+    in order to reshape the matrix."""
+    total_elements = len(fock_matrix)
+    n_rows = int(np.sqrt(total_elements))
+    n_cols = n_rows
+
+    # Q-Chem splits the printing of the matrix into chunks of 6 elements
+    # per line. TODO: Is there a better way than to hard-code this?
+    chunks = 6 * n_rows
+    # Decide the indices of the chunks
+    chunk_indices = np.arange(chunks, total_elements, chunks)
+    # Split the arrays into the chunks
+    fock_matrix_chunks = np.split(fock_matrix, chunk_indices)
+
+    # Reshape the chunks into the matrix and populate the matrix
+    fock_matrix_reshaped = np.zeros(shape=(n_rows, n_cols), dtype=float)
+    index_cols = 0
+    for fock_matrix_chunk in fock_matrix_chunks:
+        n_cols_chunks = len(fock_matrix_chunk) / n_rows
+        n_cols_chunks = int(n_cols_chunks)
+        fock_matrix_chunk_reshaped = np.reshape(fock_matrix_chunk, (n_rows, n_cols_chunks))
+        fock_matrix_reshaped[:, index_cols : index_cols + n_cols_chunks] = fock_matrix_chunk_reshaped
+        index_cols += n_cols_chunks
+
+    return fock_matrix_reshaped