Fix sum formula parsing

* With certain combinations of elements a wrong sum formula was displayed in the main window
dkratzert · Sep 12, 2024 · e58e0c4 · e58e0c4
1 parent 5f9d287
commit e58e0c4
Show file tree

Hide file tree

Showing 3 changed files with 238 additions and 49 deletions.
diff --git a/finalcif/tools/chemparse.py b/finalcif/tools/chemparse.py
@@ -0,0 +1,204 @@
+"""
+Copyright (c) 2024 Grayson Boyer and Victor Ignatenko
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE."""
+
+import re
+from typing import Generator
+from typing import Any
+
+
+class ChemparseError(Exception):
+    """Common base class of all errors raised by the formula parser.
+
+    Constructor arguments are handed to ``Exception`` unchanged.
+    """
+
+
+class ParenthesesMismatchError(ChemparseError):
+    """Signals that opening and closing parentheses of a formula do not pair up."""
+
+    def __init__(self, formula: str) -> None:
+        message = f"Open and closed parentheses mismatch in formula '{formula}'"
+        super().__init__(message)
+
+
+class NestedParenthesesError(ChemparseError):
+    """Error for formulas whose nested parentheses cannot be parsed.
+
+    NOTE(review): nothing in the visible code raises this error; nested
+    groups are parsed recursively instead.
+    """
+
+    def __init__(self, formula: str) -> None:
+        message = f"Cannot parse nested parentheses in formula '{formula}'"
+        super().__init__(message)
+
+
+class ClosedParenthesesBeforeOpenError(ChemparseError):
+    """Signals a ")" that appears before the "(" it would have to close."""
+
+    def __init__(self, formula: str) -> None:
+        message = f"Closed parentheses detected before open parentheses in formula '{formula}'"
+        super().__init__(message)
+
+
+# All three patterns are anchored with "^", so they only ever match at the very
+# start of the text that is still left to parse.
+#
+# RE_SIGNED_NUMBER: optional sign, digits, optional ".digits" fraction and an
+# optional exponent. The "(?=.)" lookahead demands at least one character, but
+# every numeric part is optional, so the match can be an empty string when the
+# text does not start with a number.
+RE_SIGNED_NUMBER: str = r"(^(?=.)([+-]?([0-9]*)(\.([0-9]+))?)([eE][+-]?\d+)?)"
+# RE_NUMBER: same as RE_SIGNED_NUMBER, but without the leading sign.
+RE_NUMBER: str = r"(^(?=.)(([0-9]*)(\.([0-9]+))?)([eE][+-]?\d+)?)"
+# RE_LETTERS: leading run of ASCII letters, "-" and "+" characters.
+RE_LETTERS: str = r"^[a-zA-Z-+]+"
+
+
+# function to return index of all instances of a substring in a string
+
+
+def find_all(sub: str, a_str: str) -> Generator[int, Any, None]:
+    """Yield the start index of every non-overlapping occurrence of ``sub`` in ``a_str``.
+
+    Indices are produced lazily in ascending order. An empty ``sub`` matches
+    once at every position from 0 to ``len(a_str)`` inclusive.
+
+    Args:
+        sub: The substring to look for.
+        a_str: The string that is searched.
+
+    Yields:
+        The index at which each occurrence starts.
+    """
+    # An empty needle has length 0. Stepping by at least one character keeps
+    # the search moving instead of yielding index 0 forever.
+    step = max(len(sub), 1)  # use step = 1 to find overlapping matches
+    start = a_str.find(sub)
+    while start != -1:
+        yield start
+        start = a_str.find(sub, start + step)
+
+
+# functions to parse elemental formulas (handles both floats and ints)
+def get_first_elem(formula: str) -> tuple[str, bool]:
+    """Return the leading element token of ``formula`` and whether it was cut off.
+
+    An uppercase letter, "+" or "-" counts as the start of a new token. Two
+    rules decide where the first token ends:
+
+    * Repetition rule (tried first): if the leading character occurs again,
+      the text is cut in front of that second occurrence. This applies only
+      when the second character is that same character or a lowercase
+      letter, and the part before the cut holds exactly one uppercase letter.
+    * Otherwise the text is cut in front of the first token-start character
+      that differs from the leading character.
+
+    Args:
+        formula: Text from which the first token is taken.
+
+    Returns:
+        ``(token, needed_split)``. ``needed_split`` is False when no rule
+        applied and the whole input is returned unchanged.
+    """
+    for symbol in formula:
+        starts_token = symbol.isupper() or symbol in ("+", "-")
+        if not starts_token:
+            continue
+
+        first_pos = formula.find(symbol)
+        if first_pos != 0:
+            # A new token begins here; everything in front of it is the result.
+            return formula[:first_pos], True
+
+        positions = list(find_all(symbol, formula))
+        if len(positions) < 2:
+            continue
+        # len(positions) >= 2 guarantees that formula[1] exists.
+        follower = formula[1]
+        if follower != symbol and not follower.islower():
+            continue
+        head = formula[:positions[1]]
+        if sum(map(str.isupper, head)) == 1:
+            return head, True
+
+    return formula, False
+
+
+def inner_parse_formula(text: str) -> dict[str, float]:
+    """Parse a formula without parentheses into a mapping of token to amount.
+
+    The text is consumed from the left: a token matched by ``RE_LETTERS`` and
+    narrowed by ``get_first_elem``, then the number that follows it. Parsing
+    stops at the first position that does not start with such a token, and
+    whatever is left over is ignored. Amounts of repeated tokens are added up.
+
+    Args:
+        text: Formula text, e.g. ``"C12H6O3Mn7"``.
+
+    Returns:
+        The amount per token. A token that is not followed by a parsable
+        number counts as 1.0.
+    """
+    formula_dict: dict[str, float] = {}
+    # Each pass removes at least one character, so len(text) passes are enough.
+    for _ in range(len(text)):
+        letters = re.match(RE_LETTERS, text)
+        if letters is None:
+            break
+        element, needed_split = get_first_elem(letters.group(0))
+        # The token is a prefix of the text, so cutting it off removes exactly it.
+        text = text[len(element):]
+        number = 1.0
+        if not needed_split:
+            # A token that was cut out of a longer letter run is directly
+            # followed by the next token, so only an uncut one can carry a number.
+            try:
+                number = float(re.findall(RE_SIGNED_NUMBER, text)[0][0])
+            except (IndexError, ValueError):
+                # IndexError: no text left. ValueError: the (possibly empty)
+                # match is not a number. Both mean an implicit amount of one.
+                number = 1.0
+            text = re.sub(RE_SIGNED_NUMBER, "", text)
+        if element in formula_dict:
+            formula_dict[element] += number
+        else:
+            formula_dict[element] = number
+    return formula_dict
+
+
+def find_occurrences(s: str, ch: str) -> list[int]:
+    """Return the indices of all characters of ``s`` that are equal to ``ch``."""
+    positions: list[int] = []
+    for index, letter in enumerate(s):
+        if letter == ch:
+            positions.append(index)
+    return positions
+
+
+def get_first_parenth_match(text: str) -> int:
+    """Find the ")" at which the parentheses are balanced for the first time.
+
+    The result is the zero-based ordinal of that ")" among all ")" characters
+    of ``text``, not a character index. It is meant to be used as an index
+    into a list of ")" positions.
+
+    Args:
+        text: Text to scan from left to right.
+
+    Returns:
+        The ordinal of the first ")" after which as many ")" as "(" have been
+        seen, or -1 if that never happens.
+    """
+    opened = 0
+    closed = 0
+    for symbol in text:
+        if symbol == '(':
+            opened += 1
+            continue
+        if symbol != ')':
+            continue
+        closed += 1
+        if closed == opened:
+            return closed - 1
+    return -1
+
+
+def parse_formula(text: str) -> dict[str, float]:
+    """Parse a chemical formula, including parenthesised groups, into amounts.
+
+    Square brackets are treated like round parentheses. Each outermost group
+    is parsed on its own (recursively when it contains further groups) and
+    multiplied by the number that follows its closing parenthesis, or by 1 if
+    there is none. The group and its multiplier are then removed from the
+    text. The text that remains is parsed with ``inner_parse_formula`` and all
+    partial results are added up.
+
+    Args:
+        text: The formula, e.g. ``"C4H6O(H2O)5"``. It is converted with ``str()``.
+
+    Returns:
+        The summed amount per token. Keys appear in the order in which they
+        were first encountered, groups before the remaining text.
+
+    Raises:
+        ParenthesesMismatchError: The numbers of "(" and ")" differ.
+        ClosedParenthesesBeforeOpenError: The n-th ")" is located before the
+            n-th "(" for some n.
+    """
+    text = str(text).replace("[", "(").replace("]", ")")
+
+    # get indices of starting parentheses "(" and ending ")"
+    open_parenth_idx_list = find_occurrences(text, "(")
+    closed_parenth_idx_list = find_occurrences(text, ")")
+
+    if len(open_parenth_idx_list) != len(closed_parenth_idx_list):
+        raise ParenthesesMismatchError(text)
+
+    # Every pair is checked, including the last one.
+    for open_idx, closed_idx in zip(open_parenth_idx_list, closed_parenth_idx_list):
+        if closed_idx < open_idx:
+            raise ClosedParenthesesBeforeOpenError(text)
+
+    seg_dict_list: list[dict[str, float]] = []
+    # One pass per "(" is an upper bound: each pass removes one outermost
+    # group together with everything nested inside it.
+    for _ in range(len(open_parenth_idx_list)):
+        if "(" not in text and ")" not in text:
+            break
+
+        # The text shrinks on every pass, so the positions are looked up again.
+        open_parenth_idx_list = find_occurrences(text, "(")
+        closed_parenth_idx_list = find_occurrences(text, ")")
+
+        first_parenth_match: int = get_first_parenth_match(text)
+        if first_parenth_match < 0:
+            raise ParenthesesMismatchError(text)
+        seg_start = open_parenth_idx_list[0]
+        seg_end = closed_parenth_idx_list[first_parenth_match]
+        after_seg = text[seg_end + 1:]
+
+        try:
+            number = float(re.findall(RE_SIGNED_NUMBER, after_seg)[0][0])
+        except (IndexError, ValueError):
+            # IndexError: nothing follows the group. ValueError: what follows
+            # is not a number. Both mean a multiplier of one.
+            number = 1.0
+
+        seg_no_parenth = text[seg_start + 1:seg_end]
+        if "(" in seg_no_parenth or ")" in seg_no_parenth:
+            seg_formula_dict = parse_formula(seg_no_parenth)
+        else:
+            seg_formula_dict = inner_parse_formula(seg_no_parenth)
+        seg_dict_list.append({k: v * number for k, v in seg_formula_dict.items()})
+
+        # NOTE(review): the multiplier is read with RE_SIGNED_NUMBER but removed
+        # with the unsigned RE_NUMBER, so a signed multiplier stays in the text.
+        text = text[:seg_start] + re.sub(RE_NUMBER, "", after_seg)
+
+    if "(" in text:
+        seg_dict_list.append(parse_formula(text))
+    else:
+        seg_dict_list.append(inner_parse_formula(text))
+
+    if len(seg_dict_list) == 1:
+        return seg_dict_list[0]
+
+    # merge and sum all segments; plain dict insertion keeps the key order
+    # stable between runs
+    merged: dict[str, float] = {}
+    for seg_dict in seg_dict_list:
+        for element, amount in seg_dict.items():
+            merged[element] = merged.get(element, 0) + amount
+    return merged
diff --git a/finalcif/tools/sumformula.py b/finalcif/tools/sumformula.py
@@ -1,49 +1,20 @@
 from typing import Union, Dict
 
-from finalcif.cif.atoms import atoms
+from finalcif.tools import chemparse
 
 
-def formula_str_to_dict(sumform: Union[str, bytes]) -> Dict[str, str]:
+def formula_str_to_dict(sumform: Union[str, bytes]) -> Dict[str, float]:
     """
     converts an atom name like C12 to the element symbol C
     Use this code to find the atoms while going through the character astream of a sumformula
     e.g. C12H6O3Mn7
     Find two-char atoms, them one-char, and see if numbers are in between.
     """
-    elements = [x.upper() for x in atoms]
-    atlist = {}
-    nums = []
-    try:
-        sumform = sumform.upper().replace(' ', '').replace('\n', '').replace('\r', '')
-    except AttributeError:
-        print('Error in formula_str_to_dict')
-        return atlist
+    chemical_formula = chemparse.parse_formula(sumform.replace(" ", ""))
+    return chemical_formula
 
-    def isnumber(el):
-        for x in el:
-            if x.isnumeric() or x == '.':
-                nums.append(x)
-            else:
-                # end of number
-                break
 
-    while sumform:
-        if sumform[0:2] in elements:  # The two-character elements
-            isnumber(sumform[2:])
-            atlist[sumform[0:2].capitalize()] = "".join(nums)
-            sumform = sumform[2 + len(nums):]
-            nums.clear()
-        elif sumform[0] in elements:
-            isnumber(sumform[1:])
-            atlist[sumform[0]] = "".join(nums)
-            sumform = sumform[1 + len(nums):]
-            nums.clear()
-        else:
-            raise KeyError
-    return atlist
-
-
-def sum_formula_to_html(sumform: Dict[str, str], break_after: int = 99) -> str:
+def sum_formula_to_html(sumform: Dict[str, float | int], break_after: int = 99) -> str:
     """
     Makes html formatted sum formula from dictionary.
     """

diff --git a/tests/test_sumform.py b/tests/test_sumform.py
@@ -6,49 +6,63 @@
 class MyTestCase(unittest.TestCase):
 
     def test_sumform1(self):
-        self.assertEqual({'S': '', 'Sn': ''}, formula_str_to_dict("SSn"))
+        self.assertEqual({'S': 1.0, 'Sn': 1.0}, formula_str_to_dict("SSn"))
 
     def test_sumform2(self):
-        self.assertEqual({'S': '1', 'Cl': ''}, formula_str_to_dict("S1Cl"))
+        self.assertEqual({'Cl': 1.0, 'S': 1.0}, formula_str_to_dict("S1Cl"))
 
     def test_sumform3(self):
-        self.assertEqual({'C': '12', 'H': '6', 'O': '3', 'Mn': '7'}, formula_str_to_dict("C12H6O3Mn7"))
+        self.assertEqual({'C': 12.0, 'H': 6.0, 'Mn': 7.0, 'O': 3.0}, formula_str_to_dict("C12H6O3Mn7"))
 
     def test_sumform4(self):
-        self.assertEqual({'C': '12', 'H': '60', 'O': '3', 'Mn': '7'}, formula_str_to_dict("C12 H60 O3 Mn7"))
+        self.assertEqual({'C': 12.0, 'H': 60.0, 'Mn': 7.0, 'O': 3.0}, formula_str_to_dict("C12 H60 O3 Mn7"))
 
     def test_sumform5(self):
-        self.assertEqual({'C': '12', 'H': '60', 'O': '3', 'Mn': '7'}, formula_str_to_dict("C12 H60 O3  Mn 7"))
+        self.assertEqual({'C': 12.0, 'H': 60.0, 'Mn': 7.0, 'O': 3.0}, formula_str_to_dict("C12 H60 O3  Mn 7"))
 
     def test_sumform6(self):
-        self.assertEqual({'C': '13', 'Cs': '12', 'H': '60', 'O': '3', 'Mn': '7'},
+        self.assertEqual({'C': 13.0, 'Cs': 12.0, 'H': 60.0, 'Mn': 7.0, 'O': 3.0},
                          formula_str_to_dict("C13Cs12 H60 O3  Mn 7"))
 
     def test_sumform7(self):
-        self.assertEqual({'C': '', 'H': '', 'Mn': ''}, formula_str_to_dict("CHMn\n"))
+        self.assertEqual({'C': 1.0, 'H': 1.0, 'Mn': 1.0}, formula_str_to_dict("CHMn\n"))
 
     def test_sumform8(self):
-        with self.assertRaises(KeyError):
-            formula_str_to_dict("Hallo")
+        self.assertEqual({'Hallo': 1.0}, formula_str_to_dict("Hallo"))
 
     def test_sumform9(self):
-        with self.assertRaises(KeyError):
-            formula_str_to_dict("H3O+")
+        self.assertEqual({'+': 1.0, 'H': 3.0, 'O': 1.0}, formula_str_to_dict("H3O+"))
 
     def test_sumform10(self):
-        self.assertEqual({'C': '4', 'H': '2.91', 'Al': '0.12', 'F': '4.36', 'Ni': '0.12', 'O': '0.48'},
+        self.assertEqual({'Al': 0.12, 'C': 4.0, 'F': 4.36, 'H': 2.91, 'Ni': 0.12, 'O': 0.48},
                          formula_str_to_dict('C4 H2.91 Al0.12 F4.36 Ni0.12 O0.48'))
 
     def test_sumform11(self):
-        with self.assertRaises(KeyError):
-            formula_str_to_dict('C4H6O1*5H2O')
+        self.assertEqual({'C': 4.0, 'H': 8.0, 'O': 2.0}, formula_str_to_dict('C4H6O*5(H2O)'))
+
+    def test_sumform12(self):
+        self.assertEqual({'B': 1.0, 'C': 15.0, 'F': 2.0, 'H': 23.0, 'N': 2.0, 'O': 1.0, 'Si': 2.0},
+                         formula_str_to_dict('C15 H23 B F2 N2 O Si2'))
+
+    def test_sumform13(self):
+        self.assertEqual({'B': 1.0, 'C': 15.0, 'F': 2.0, 'H': 23.0, 'N': 2.0, 'O': 1.0, 'Si': 1.0},
+                         formula_str_to_dict('C15 H23 B F2 N2 O Si'))
+
+    def test_sumform14(self):
+        self.assertEqual({'B': 1.0, 'C': 15.0, 'F': 2.0, 'H': 23.0, 'I': 1.0, 'N': 2.0, 'Os': 1.0},
+                         formula_str_to_dict('C15 H23 B F2 N2 Os I'))
 
 
 class TestSumformHTLM(unittest.TestCase):
 
     def test_sumform_to_html(self):
         self.assertEqual('<html><body>C<sub>12</sub>H<sub>6</sub>O<sub>3</sub>Mn<sub>7</sub></body></html>',
-                         sum_formula_to_html({'C': '12', 'H': '6', 'O': '3', 'Mn': '7'}))
+                         sum_formula_to_html({'C': 12, 'H': 6, 'O': 3, 'Mn': 7}))
+
+    def test_sumform_to_html_with_difficult_elements(self):
+        self.assertEqual(
+            '<html><body>C<sub>15</sub>H<sub>23</sub>BF<sub>2</sub>N<sub>2</sub>OSi<sub>2</sub></body></html>',
+            sum_formula_to_html(formula_str_to_dict('C15H23BF2N2OSi2')))
 
 
 if __name__ == '__main__':