Skip to content

Commit

Permalink
Fix sum formula parsing
Browse files Browse the repository at this point in the history
* With certain combinations of elements a wrong sum formula was displayed in the main window
  • Loading branch information
dkratzert committed Sep 12, 2024
1 parent 5f9d287 commit e58e0c4
Show file tree
Hide file tree
Showing 3 changed files with 238 additions and 49 deletions.
204 changes: 204 additions & 0 deletions finalcif/tools/chemparse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
"""
Copyright (c) 2024 Grayson Boyer and Victor Ignatenko
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE."""

import re
from typing import Generator
from typing import Any


class ChemparseError(Exception):
    """Common base class of all errors raised by this formula parser."""

    def __init__(self, *args: object) -> None:
        """Forward every positional argument unchanged to ``Exception``."""
        super().__init__(*args)


class ParenthesesMismatchError(ChemparseError):
    """Error carrying a message that the formula's parentheses do not pair up."""

    def __init__(self, formula: str) -> None:
        """Build the error message around the offending ``formula`` text."""
        message = f"Open and closed parentheses mismatch in formula '{formula}'"
        super().__init__(message)


class NestedParenthesesError(ChemparseError):
    """Error carrying a message that nested parentheses cannot be parsed."""

    def __init__(self, formula: str) -> None:
        """Build the error message around the offending ``formula`` text."""
        message = f"Cannot parse nested parentheses in formula '{formula}'"
        super().__init__(message)


class ClosedParenthesesBeforeOpenError(ChemparseError):
    """Error carrying a message that a ")" appears before its "("."""

    def __init__(self, formula: str) -> None:
        """Build the error message around the offending ``formula`` text."""
        message = f"Closed parentheses detected before open parentheses in formula '{formula}'"
        super().__init__(message)


# Number at the very start of the text: optional sign, optional integer digits,
# optional fractional part and optional exponent. The lookahead only demands
# that the text is non-empty; since every part is optional, the pattern can
# also match an empty string at the start of a non-empty text.
RE_SIGNED_NUMBER: str = r"(^(?=.)([+-]?([0-9]*)(\.([0-9]+))?)([eE][+-]?\d+)?)"
# Same as RE_SIGNED_NUMBER but without the optional leading sign.
RE_NUMBER: str = r"(^(?=.)(([0-9]*)(\.([0-9]+))?)([eE][+-]?\d+)?)"
# One or more ASCII letters, "-" or "+" at the very start of the text.
RE_LETTERS: str = r"^[a-zA-Z-+]+"


# function to return index of all instances of a substring in a string


def find_all(sub: str, a_str: str) -> Generator[int, Any, None]:
    """Yield the start index of each non-overlapping occurrence of ``sub``.

    The search runs left to right over ``a_str``; after a hit it resumes
    ``len(sub)`` characters later, so overlapping matches are skipped.

    An empty ``sub`` matches at every position ``0 .. len(a_str)``. The scan
    then advances one character at a time instead of looping forever on
    index 0.

    :param sub: substring to look for.
    :param a_str: text to search in.
    :return: generator of ascending start indices.
    """
    # Advance by at least one character so an empty ``sub`` cannot stall the scan.
    step = len(sub) or 1
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)


# functions to parse elemental formulas (handles both floats and ints)
def get_first_elem(formula: str) -> tuple[str, bool]:
    """Cut the leading token off a run of letters and sign characters.

    Only an upper-case letter, "+" or "-" is treated as the start of a token.

    :param formula: run of characters to split.
    :return: ``(token, needed_split)``; ``needed_split`` is True when the
        returned token is shorter than ``formula``, otherwise ``formula`` is
        returned unchanged together with False.
    """
    for symbol in formula:
        # Anything else can never mark the start of a new token.
        if not (symbol.isupper() or symbol in ("+", "-")):
            continue

        first_hit = formula.find(symbol)
        if first_hit != 0:
            # A different token starts here: keep everything in front of it.
            return formula[:first_hit], True

        # ``symbol`` equals the leading character; look for its repetition.
        hits = [pos for pos, candidate in enumerate(formula) if candidate == symbol]
        if len(hits) < 2:
            continue
        second_hit = hits[1]
        doubled_or_lower_next = formula[1] == symbol or formula[1].islower()
        upper_count = sum(1 for c in formula[:second_hit] if c.isupper())
        if doubled_or_lower_next and upper_count == 1:
            # e.g. "SSn" -> "S", "CaC" -> "Ca"
            return formula[:second_hit], True

    return formula, False


def inner_parse_formula(text: str) -> dict[str, float]:
    """Parse a parenthesis-free formula into ``{token: amount}``.

    The text is consumed from the left: a run matching ``RE_LETTERS`` is
    reduced to its first token by ``get_first_elem``, then an optional number
    matching ``RE_SIGNED_NUMBER`` is read as that token's amount. A token
    without a readable number counts as 1.0. Amounts of repeated tokens are
    summed. Parsing stops at the first position that does not start with a
    character accepted by ``RE_LETTERS``; the rest of the text is ignored.

    :param text: formula without parentheses, e.g. ``"C12H6O3Mn7"``.
    :return: mapping of token to summed amount, in order of first appearance.
    """
    formula_dict: dict[str, float] = {}
    # Every pass removes at least one character, so len(text) passes suffice.
    for _ in range(len(text)):
        letters = re.findall(RE_LETTERS, text)
        if not letters:
            break
        element, needed_split = get_first_elem(letters[0])
        text = text.replace(element, '', 1)
        if needed_split:
            # Another token follows directly, so there is no number to read.
            number = 1.0
        else:
            try:
                number = float(re.findall(RE_SIGNED_NUMBER, text)[0][0])
            except (IndexError, ValueError):
                # IndexError: no match (empty text); ValueError: the match is
                # empty or not a complete number.
                number = 1.0
            text = re.sub(RE_SIGNED_NUMBER, "", text)
        if element in formula_dict:
            formula_dict[element] += number
        else:
            formula_dict[element] = number
    return formula_dict


def find_occurrences(s: str, ch: str) -> list[int]:
    """Return, in ascending order, each index of ``s`` whose item equals ``ch``."""
    positions: list[int] = []
    for index, symbol in enumerate(s):
        if symbol == ch:
            positions.append(index)
    return positions


def get_first_parenth_match(text: str) -> int:
    """Locate the ")" that first balances the parentheses seen so far.

    :param text: text to scan from left to right.
    :return: zero-based ordinal, counted among all ")" in ``text``, of the
        first ")" at which as many "(" as ")" have been read; -1 when no
        such ")" exists.
    """
    depth = 0  # "(" read so far minus ")" read so far
    closings_seen = 0
    for symbol in text:
        if symbol == '(':
            depth += 1
            continue
        if symbol != ')':
            continue
        depth -= 1
        closings_seen += 1
        if depth == 0:
            return closings_seen - 1
    return -1


def parse_formula(text: str) -> dict[str, float]:
    """Parse a chemical formula, including nested parenthesised groups.

    Square brackets are treated like round ones. Each outermost group is
    parsed (recursively when it contains further parentheses), multiplied by
    the number that directly follows its closing parenthesis (1 when there
    is none), and cut out of the text. What remains is parsed as a flat
    formula, and all partial results are summed per token.

    :param text: formula such as ``"Fe2(SO4)3"``; converted with ``str()``.
    :return: mapping of token to summed amount. Keys appear in the order in
        which they are first encountered while merging (groups first, then
        the remaining flat part).
    :raises ParenthesesMismatchError: if the numbers of opening and closing
        parentheses differ, or no balancing ")" can be found.
    :raises ClosedParenthesesBeforeOpenError: if the n-th ")" precedes the
        n-th "(" for any n.
    """
    text = str(text).replace("[", "(").replace("]", ")")

    open_parenth_idx_list = find_occurrences(text, "(")
    closed_parenth_idx_list = find_occurrences(text, ")")

    if len(open_parenth_idx_list) != len(closed_parenth_idx_list):
        raise ParenthesesMismatchError(text)

    # In a well-formed formula the n-th ")" always follows the n-th "(".
    # Every pair is checked, including the last one.
    for open_idx, closed_idx in zip(open_parenth_idx_list, closed_parenth_idx_list):
        if closed_idx < open_idx:
            raise ClosedParenthesesBeforeOpenError(text)

    seg_dict_list: list[dict[str, float]] = []
    # Each pass removes at least one pair, so one pass per pair is enough.
    for _ in range(len(open_parenth_idx_list)):
        if '(' not in text and ')' not in text:
            break

        # The text shrinks on every pass, so the indices are recomputed.
        open_parenth_idx_list = find_occurrences(text, "(")
        closed_parenth_idx_list = find_occurrences(text, ")")

        first_parenth_match: int = get_first_parenth_match(text)
        if first_parenth_match < 0:
            raise ParenthesesMismatchError(text)
        seg_start = open_parenth_idx_list[0]
        seg_end = closed_parenth_idx_list[first_parenth_match] + 1
        tail = text[seg_end:]

        try:
            number = float(re.findall(RE_SIGNED_NUMBER, tail)[0][0])
        except (IndexError, ValueError):
            # IndexError: nothing follows the group; ValueError: what follows
            # is not a complete number.
            number = 1.0

        seg_no_parenth = text[seg_start + 1:seg_end - 1]
        if '(' in seg_no_parenth or ')' in seg_no_parenth:
            seg_formula_dict = parse_formula(seg_no_parenth)
        else:
            seg_formula_dict = inner_parse_formula(seg_no_parenth)
        seg_dict_list.append({k: v * number for k, v in seg_formula_dict.items()})

        # NOTE(review): the multiplier is read with the signed pattern but
        # removed with the unsigned one, so a signed multiplier stays in the
        # text and is parsed again below - confirm this is intended.
        text = text[:seg_start] + re.sub(RE_NUMBER, "", tail)

    if '(' in text:
        seg_dict_list.append(parse_formula(text))
    else:
        seg_dict_list.append(inner_parse_formula(text))

    if len(seg_dict_list) == 1:
        return seg_dict_list[0]

    # Merge and sum all segments, keeping a deterministic first-seen key order.
    merged: dict[str, float] = {}
    for seg_dict in seg_dict_list:
        for element, amount in seg_dict.items():
            merged[element] = merged.get(element, 0) + amount
    return merged
39 changes: 5 additions & 34 deletions finalcif/tools/sumformula.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,20 @@
from typing import Union, Dict

from finalcif.cif.atoms import atoms
from finalcif.tools import chemparse


def formula_str_to_dict(sumform: Union[str, bytes]) -> Dict[str, str]:
def formula_str_to_dict(sumform: Union[str, bytes]) -> Dict[str, float]:
"""
converts an atom name like C12 to the element symbol C
Use this code to find the atoms while going through the character stream of a sumformula
e.g. C12H6O3Mn7
Find two-char atoms, then one-char, and see if numbers are in between.
"""
elements = [x.upper() for x in atoms]
atlist = {}
nums = []
try:
sumform = sumform.upper().replace(' ', '').replace('\n', '').replace('\r', '')
except AttributeError:
print('Error in formula_str_to_dict')
return atlist
chemical_formula = chemparse.parse_formula(sumform.replace(" ", ""))
return chemical_formula

def isnumber(el):
for x in el:
if x.isnumeric() or x == '.':
nums.append(x)
else:
# end of number
break

while sumform:
if sumform[0:2] in elements: # The two-character elements
isnumber(sumform[2:])
atlist[sumform[0:2].capitalize()] = "".join(nums)
sumform = sumform[2 + len(nums):]
nums.clear()
elif sumform[0] in elements:
isnumber(sumform[1:])
atlist[sumform[0]] = "".join(nums)
sumform = sumform[1 + len(nums):]
nums.clear()
else:
raise KeyError
return atlist


def sum_formula_to_html(sumform: Dict[str, str], break_after: int = 99) -> str:
def sum_formula_to_html(sumform: Dict[str, float | int], break_after: int = 99) -> str:
"""
Makes html formatted sum formula from dictionary.
"""
Expand Down
44 changes: 29 additions & 15 deletions tests/test_sumform.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,49 +6,63 @@
class MyTestCase(unittest.TestCase):

def test_sumform1(self):
self.assertEqual({'S': '', 'Sn': ''}, formula_str_to_dict("SSn"))
self.assertEqual({'S': 1.0, 'Sn': 1.0}, formula_str_to_dict("SSn"))

def test_sumform2(self):
self.assertEqual({'S': '1', 'Cl': ''}, formula_str_to_dict("S1Cl"))
self.assertEqual({'Cl': 1.0, 'S': 1.0}, formula_str_to_dict("S1Cl"))

def test_sumform3(self):
self.assertEqual({'C': '12', 'H': '6', 'O': '3', 'Mn': '7'}, formula_str_to_dict("C12H6O3Mn7"))
self.assertEqual({'C': 12.0, 'H': 6.0, 'Mn': 7.0, 'O': 3.0}, formula_str_to_dict("C12H6O3Mn7"))

def test_sumform4(self):
self.assertEqual({'C': '12', 'H': '60', 'O': '3', 'Mn': '7'}, formula_str_to_dict("C12 H60 O3 Mn7"))
self.assertEqual({'C': 12.0, 'H': 60.0, 'Mn': 7.0, 'O': 3.0}, formula_str_to_dict("C12 H60 O3 Mn7"))

def test_sumform5(self):
self.assertEqual({'C': '12', 'H': '60', 'O': '3', 'Mn': '7'}, formula_str_to_dict("C12 H60 O3 Mn 7"))
self.assertEqual({'C': 12.0, 'H': 60.0, 'Mn': 7.0, 'O': 3.0}, formula_str_to_dict("C12 H60 O3 Mn 7"))

def test_sumform6(self):
self.assertEqual({'C': '13', 'Cs': '12', 'H': '60', 'O': '3', 'Mn': '7'},
self.assertEqual({'C': 13.0, 'Cs': 12.0, 'H': 60.0, 'Mn': 7.0, 'O': 3.0},
formula_str_to_dict("C13Cs12 H60 O3 Mn 7"))

def test_sumform7(self):
self.assertEqual({'C': '', 'H': '', 'Mn': ''}, formula_str_to_dict("CHMn\n"))
self.assertEqual({'C': 1.0, 'H': 1.0, 'Mn': 1.0}, formula_str_to_dict("CHMn\n"))

def test_sumform8(self):
with self.assertRaises(KeyError):
formula_str_to_dict("Hallo")
self.assertEqual({'Hallo': 1.0}, formula_str_to_dict("Hallo"))

def test_sumform9(self):
with self.assertRaises(KeyError):
formula_str_to_dict("H3O+")
self.assertEqual({'+': 1.0, 'H': 3.0, 'O': 1.0}, formula_str_to_dict("H3O+"))

def test_sumform10(self):
self.assertEqual({'C': '4', 'H': '2.91', 'Al': '0.12', 'F': '4.36', 'Ni': '0.12', 'O': '0.48'},
self.assertEqual({'Al': 0.12, 'C': 4.0, 'F': 4.36, 'H': 2.91, 'Ni': 0.12, 'O': 0.48},
formula_str_to_dict('C4 H2.91 Al0.12 F4.36 Ni0.12 O0.48'))

def test_sumform11(self):
with self.assertRaises(KeyError):
formula_str_to_dict('C4H6O1*5H2O')
self.assertEqual({'C': 4.0, 'H': 8.0, 'O': 2.0}, formula_str_to_dict('C4H6O*5(H2O)'))

def test_sumform12(self):
self.assertEqual({'B': 1.0, 'C': 15.0, 'F': 2.0, 'H': 23.0, 'N': 2.0, 'O': 1.0, 'Si': 2.0},
formula_str_to_dict('C15 H23 B F2 N2 O Si2'))

def test_sumform13(self):
self.assertEqual({'B': 1.0, 'C': 15.0, 'F': 2.0, 'H': 23.0, 'N': 2.0, 'O': 1.0, 'Si': 1.0},
formula_str_to_dict('C15 H23 B F2 N2 O Si'))

def test_sumform14(self):
self.assertEqual({'B': 1.0, 'C': 15.0, 'F': 2.0, 'H': 23.0, 'I': 1.0, 'N': 2.0, 'Os': 1.0},
formula_str_to_dict('C15 H23 B F2 N2 Os I'))


class TestSumformHTLM(unittest.TestCase):

def test_sumform_to_html(self):
self.assertEqual('<html><body>C<sub>12</sub>H<sub>6</sub>O<sub>3</sub>Mn<sub>7</sub></body></html>',
sum_formula_to_html({'C': '12', 'H': '6', 'O': '3', 'Mn': '7'}))
sum_formula_to_html({'C': 12, 'H': 6, 'O': 3, 'Mn': 7}))

def test_sumform_to_html_with_difficult_elements(self):
self.assertEqual(
'<html><body>C<sub>15</sub>H<sub>23</sub>BF<sub>2</sub>N<sub>2</sub>OSi<sub>2</sub></body></html>',
sum_formula_to_html(formula_str_to_dict('C15H23BF2N2OSi2')))


if __name__ == '__main__':
Expand Down

0 comments on commit e58e0c4

Please sign in to comment.