merge conflict

ekwan · Oct 5, 2020 · fa87090 · fa87090
2 parents e8811b1 + 89f38d4
commit fa87090
Show file tree

Hide file tree

Showing 8 changed files with 174 additions and 24 deletions.
diff --git a/cctk/ensemble.py b/cctk/ensemble.py
@@ -596,3 +596,16 @@ def get_geometric_parameters(self, parameter, atom1, atom2, atom3=None, atom4=No
 
         return output
 
+    def assign_connectivity(self, index=0):
+        """
+        Assigns connectivity for all molecules based on molecule of index ``index``. Much faster than assigning connectivity for each individually -- but assumes all bonding is the same.
+        """
+        assert isinstance(index, int), "Need integer index"
+        bonds = self.molecules[index].assign_connectivity().bonds
+
+        for mol in self.molecules:
+            mol.bonds = bonds
+
+        return self
+
+
diff --git a/cctk/molecule.py b/cctk/molecule.py
@@ -35,7 +35,6 @@ class Molecule:
         bonds (nx.Graph or list of tuples): connectivity graph or list of 2-tuples, with each element representing the 1-indexed atom number of a bonded pair
         charge (int): the charge of the molecule
         multiplicity (int): the spin state of the molecule (1 corresponds to singlet, 2 to doublet, 3 to triplet, etc. -- so a multiplicity of 1 is equivalent to S=0)
-        checks (bool): whether to check that the constructor parameters are valid
         vibrational_modes (list of cctk.VibrationalMode): vibrational modes
     """
 
@@ -44,6 +43,9 @@ def __init__(self, atomic_numbers, geometry, name=None, bonds=None, charge=0, mu
         Create new Molecule object, and assign connectivity if needed.
 
         ``bonds`` must be a list of edges (i.e. an n x 2 ``numpy`` array).
+
+        If ``checks`` is True, the atomic numbers in bonds will all be checked for consistency.
+        This option can be disabled by setting ``checks`` to False, but this is not recommended for external data.
         """
         if len(atomic_numbers) != len(geometry):
             raise ValueError(f"length of geometry ({len(geometry)}) and atomic_numbers ({len(atomic_numbers)}) does not match!\n{atomic_numbers}\n{geometry}")
@@ -1665,3 +1667,12 @@ def csearch(self, nprocs=1, constraints=[], logfile=None, noncovalent=False, use
         import cctk.optimize as opt
         assert isinstance(nprocs, int), "nprocs must be int!"
         return opt.csearch(molecule=self, nprocs=nprocs, constraints=constraints, noncovalent=noncovalent, logfile=logfile, use_tempdir=use_tempdir)
+
+    def num_neighbors_by_atom(self):
+        """
+        Returns a list of the number of neighbors of each atom.
+        """
+        result = []
+        for i in range(self.num_atoms()):
+            result.append(len(self.get_adjacent_atoms(i)))
+        return result
diff --git a/cctk/parse_gaussian.py b/cctk/parse_gaussian.py
@@ -11,34 +11,52 @@
 Functions to help with parsing Gaussian files
 """
 
-def read_nmr_shifts(lines, num_atoms):
+def read_nmr_shifts(blocks, num_atoms):
     """
     Helper method to search through output file and read NMR shifts.
     Args:
         lines (list): list of lines in file
         num_atoms (int): number of atoms expected
     Returns:
         list of isotropic NMR shifts (np.ndarray)
+        list of shielding tensors (list of 3x3 np.ndarray)
     """
     # assumes that lines only come from one Link1 section
     shieldings = []
-    for line in lines:
-        fields = line.split()
-        if len(fields) == 8 and fields[2] == "Isotropic" and fields[5] == "Anisotropy":
+    tensors = []
+    for block in blocks:
+        lines = block.split("\n")
+        tensor = np.zeros(shape=(3,3))
+        for line in lines:
             fields = line.split()
-            assert len(fields) == 8, f"Expected 8 fields on an NMR shielding output line but found {len(fields)} instead!"
-            try:
-                shielding = float(fields[4])
-            except:
-                raise ValueError(f"Error parsing NMR shielding output line:\n{line}")
-            shieldings.append(shielding)
+            # there are 8 on each line but we truncate the first 2 in the block selection process
+            if len(fields) == 6 and fields[0] == "Isotropic" and fields[3] == "Anisotropy":
+                fields = line.split()
+                assert len(fields) == 6, f"Expected 6 fields on an NMR shielding output line but found {len(fields)} instead!"
+                try:
+                    shielding = float(fields[2])
+                except:
+                    raise ValueError(f"Error parsing NMR shielding output line:\n{line}")
+                shieldings.append(shielding)
+
+        # yes, this is very elegant.
+        tensor[0][0] = float(re.search("XX=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensor[0][1] = float(re.search("XY=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensor[0][2] = float(re.search("XZ=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensor[1][0] = float(re.search("YX=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensor[1][1] = float(re.search("YY=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensor[1][2] = float(re.search("YZ=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensor[2][0] = float(re.search("ZX=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensor[2][1] = float(re.search("ZY=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensor[2][2] = float(re.search("ZZ=\s+(?P<val>-?\d+\.\d+)", block).group("val"))
+        tensors.append(tensor)
 
     if len(shieldings) is not 0:
         assert len(shieldings) == num_atoms, f"Expected {num_atoms} shieldings but found {len(shieldings)}!"
-        return np.asarray(shieldings)
+        return np.asarray(shieldings), tensors
     else:
         #### we can catch this problem later if the file is finished
-        return None
+        return None, None
 
 def split_link1(filename):
     """
@@ -106,6 +124,7 @@ def read_file_fast(file_text, filename, link1idx, max_len=10000, extended_opt_in
         ["Mulliken charges", "Sum of Mulliken charges", 1],
         ["Electronic spatial extent", "Quadrupole moment", 1], #10
         ["normal coordinates", "Thermochemistry", 1],
+        ["Isotropic", "Eigenvalues", 1000],
     ]
 
     word_matches = [[] for _ in words]
@@ -289,9 +308,10 @@ def read_file_fast(file_text, filename, link1idx, max_len=10000, extended_opt_in
             properties[-1]["quasiharmonic_gibbs_free_energy"] = float(corrected_free_energy)
 
     if cctk.GaussianJobType.NMR in job_types:
-        nmr_shifts = read_nmr_shifts(word_matches[16], molecules[0].num_atoms())
+        nmr_shifts, shielding_tensors = read_nmr_shifts(block_matches[12], molecules[0].num_atoms())
         if nmr_shifts is not None:
             properties[-1]["isotropic_shielding"] = nmr_shifts.view(cctk.OneIndexedArray)
+            properties[-1]["shielding_tensors"] = shielding_tensors
 
         if re.search("nmr=mixed", f.route_card, flags=re.IGNORECASE) or re.search("nmr=spinspin", f.route_card,flags=re.IGNORECASE):
             couplings = read_j_couplings(block_matches[6], molecules[0].num_atoms())
@@ -304,7 +324,7 @@ def read_file_fast(file_text, filename, link1idx, max_len=10000, extended_opt_in
         properties[0]["forces"] = forces
 
     if cctk.GaussianJobType.POP in job_types:
-        if re.search("hirshfeld", f.route_card) or re.search("cm5", f.route_card):
+        if re.search("hirshfeld", f.route_card) or re.search("cm5", f.route_card) and len(block_matches[8]) > 0:
             charges, spins = parse_hirshfeld(block_matches[8])
             properties[-1]["hirshfeld_charges"] = charges
             properties[-1]["hirshfeld_spins"] = spins
@@ -467,6 +487,10 @@ def parse_charges_dipole(mulliken_block, dipole_block):
 def parse_hirshfeld(hirshfeld_block):
     charges = []
     spins = []
+
+    if len(hirshfeld_block) == 0:
+        return None, None
+
     for line in hirshfeld_block[0].split("\n")[2:]:
         fields = re.split(" +", line)
         fields = list(filter(None, fields))
@@ -567,3 +591,75 @@ def parse_modes(freq_block, num_atoms, hpmodes=False):
         modes.append(cctk.VibrationalMode(frequency=f, reduced_mass=m, force_constant=k, displacements=d))
 
     return modes
+
+def read_j_couplings(lines, n_atoms):
+    """
+    Helper method to search through output file and read J couplings
+    Args:
+        lines (list): list of lines in file
+        n_atoms (int): how many atoms are in the molecule
+    Returns:
+        ``couplings`` symmetric 2D np.array of couplings (in Hz) with zero-indexed atoms on both axes
+        or None if no couplings were found
+    """
+    couplings = np.zeros((n_atoms,n_atoms))
+    n_full_blocks, lines_in_partial_block = divmod(n_atoms,5)
+    n_lines = 5 * (n_full_blocks * (n_full_blocks+1) / 2) + n_full_blocks + 1
+    if lines_in_partial_block > 0:
+        n_lines += 1 + lines_in_partial_block
+    n_lines = int(n_lines)
+
+    lines = lines[0].split("\n")
+
+    i = 0
+    read_column_indices = False
+    read_row = False
+    this_column_indices = []
+    while i < n_lines:
+        # get current line
+        line = lines[i]
+
+        # if this is the header, we should be reading the column indices next
+        if "Total nuclear spin-spin coupling J (Hz):" in line:
+            i += 1
+            read_column_indices = True
+            continue
+
+        # this is not the header, so split the fields
+        fields = line.split()
+
+        # read the column indices
+        if read_column_indices:
+#            this_n_columns = len(fields)
+            this_column_indices = [ int(j)-1 for j in fields ]
+            i += 1
+            read_column_indices = False
+            read_row = True
+            continue
+        elif read_row:
+            row = int(fields[0])-1
+            for j,value in enumerate(fields[1:]):
+                column = this_column_indices[j]
+                value = value.replace("D","E")
+                value = float(value)
+                couplings[row,column] = value
+                couplings[column,row] = value
+
+            # check if we have read the entire matrix
+            if row == n_atoms - 1 and column == n_atoms - 1:
+                break
+
+            # check if this is the end of the current block
+            if row == n_atoms - 1:
+                read_column_indices = True
+                read_row = False
+                i += 1
+                continue
+
+            read_row = True
+            i += 1
+            continue
+        else:
+            raise ValueError("impossible")
+
+    return couplings
diff --git a/docs/recipe_01.rst b/docs/recipe_01.rst
@@ -151,7 +151,7 @@ Using Custom Basis Sets from the Basis Set Exchange
 
 """""""""""""""""""""""""""
 Creating Molecules By Name
-""""""""""""""""""""""""""
+"""""""""""""""""""""""""""
 
 - If ``rdkit`` is installed, then molecules can be created from a name or SMILES string. Structures should be checked for sanity!
 

diff --git a/docs/recipe_06.rst b/docs/recipe_06.rst
@@ -8,7 +8,6 @@ NMR Spectroscopy
 Chemical Shifts
 """""""""""""""
 
-- ``import cctk`` is assumed.
 - Isotropic shieldings are read automatically.
 - The ``scale_nmr_shifts`` function will apply scaling factors to the shieldings
   to produce chemical shift predictions.
@@ -107,4 +106,23 @@ Coupling Constants
     self.assertTrue(np.any(expected_couplings-couplings < 0.1))
 
 
+""""""""""""""""""
+Shielding Tensors
+""""""""""""""""""
+
+- The 3 by 3 shielding tensors are stored in ``shielding_tensors`` in ``properties_dict``.
+- Each tensor is a ``np.ndarray`` (0-indexed), stored in a 0-indexed list.
+
+::
+
+    gaussian_file = cctk.GaussianFile.read_file("test/static/methane.out")
+    ensemble = gaussian_file.ensemble
+    shielding_tensors = ensemble[-1,"shielding_tensors"]
+    print(shielding_tensors(1))
+
+    # will print
+    # array([[32.6869,  3.0191,  3.0191],
+    #   [ 3.0191, 32.6869,  3.0191],
+    #   [ 3.0191,  3.0191, 32.6869]]
+
 
diff --git a/docs/recipe_08.rst b/docs/recipe_08.rst
@@ -99,7 +99,7 @@ Epimerization
 
 """""""""""""""""
 Detecting Groups
-""""""""""""""""
+"""""""""""""""""
 
 - *cctk* can search within existing molecules for groups and return the relevant atom numbers.
 

diff --git a/scripts/analyze.py b/scripts/analyze.py
@@ -41,6 +41,7 @@ def main():
     parser.add_argument("--h", action="store_true")
     parser.add_argument("--standard_state", action="store_true", default=False)
     parser.add_argument("--correct_gibbs", action="store_true", default=False)
+    parser.add_argument("--sort", action="store_true", default=False)
     parser.add_argument("filename")
     args = vars(parser.parse_args(sys.argv[1:]))
 
@@ -83,23 +84,30 @@ def main():
     if not isinstance(values[0], list):
         values = [values]
 
-    df = pd.DataFrame(values, columns=property_names).fillna("")
+    df = pd.DataFrame(values, columns=property_names)
     df.sort_values("filename", inplace=True)
 
+    if args["correct_gibbs"]:
+        df.rename(columns={"quasiharmonic_gibbs_free_energy": "gibbs_free_energy"}, inplace=True)
+
     if args["g"]:
-        df["rel_energy"] = (df.quasiharmonic_gibbs_free_energy - df.quasiharmonic_gibbs_free_energy.min()) * 627.509469
+        print(df.columns)
+        df = df.dropna(subset=["gibbs_free_energy"])
+        df["rel_energy"] = (df.gibbs_free_energy - df.gibbs_free_energy.min()) * 627.509469
     elif args["h"]:
+        df = df.dropna(subset=["enthalpy"])
         df["rel_energy"] = (df.enthalpy - df.enthalpy.min()) * 627.509469
     else:
         try:
             df["rel_energy"] = (df.energy - df.energy.min()) * 627.509469
         except:
             df["rel_energy"] = 0
 
-    if args["correct_gibbs"]:
-        df.rename(columns={"rms_displacement": "rms_disp", "quasiharmonic_gibbs_free_energy": "GFE"}, inplace=True)
-    else:
-        df.rename(columns={"rms_displacement": "rms_disp", "gibbs_free_energy": "GFE"}, inplace=True)
+    if args["sort"]:
+        df.sort_values("rel_energy", inplace=True)
+
+    df.fillna("")
+    df.rename(columns={"rms_displacement": "rms_disp", "gibbs_free_energy": "GFE"}, inplace=True)
 
     if args["standard_state"]:
         # convert to 1 M standard state = 6.354 e.u. * 298 K / 4184 J/kcal

diff --git a/test/test_nmr.py b/test/test_nmr.py
@@ -21,6 +21,10 @@ def test_nmr1(self):
         shieldings = ensemble[:,"isotropic_shielding"]
         self.assertListEqual(list(shieldings), [198.2259, 32.6869, 32.6869, 32.6869, 32.6869])
 
+        tensors = ensemble[:, "shielding_tensors"]
+        self.assertEqual(tensors[0][0][0], 198.2259)
+        self.assertEqual(tensors[1][0][0], 32.6869)
+
     def test_nmr2(self):
         # this file contains opt freq followed by Link1 NMR on methane
         gaussian_file = cctk.GaussianFile.read_file("test/static/methane2.out")