Create chemical record from SMILES using rdkit (#191)

feos-org · Oct 15, 2023 · 0fc8bfa · 0fc8bfa
1 parent 5a8b844
commit 0fc8bfa
Show file tree

Hide file tree

Showing 6 changed files with 304 additions and 5 deletions.
diff --git a/feos-core/CHANGELOG.md b/feos-core/CHANGELOG.md
@@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Added optional `Phase` argument for constructors of dynamic properties of `DataSet`s.  [#174](https://github.com/feos-org/feos/pull/174)
 - Added `molar_weight` method to `Residual` trait. [#177](https://github.com/feos-org/feos/pull/177)
 - Added molar versions for entropy, enthalpy, etc. for residual properties. [#177](https://github.com/feos-org/feos/pull/177)
+- Python only: Added `SmartsRecord` and `ChemicalRecord.from_smiles` that uses rdkit to calculate segments and bonds from a SMILES code. [#191](https://github.com/feos-org/feos/pull/191)
+- Python only: Added `from_smiles` and `from_json_smiles` to parameter objects that build parameters from SMILES codes using rdkit. [#191](https://github.com/feos-org/feos/pull/191)
 
 ### Changed
 - Changed constructors of `Parameter` trait to return `Result`s. [#161](https://github.com/feos-org/feos/pull/161)

diff --git a/feos-core/src/python/parameter/fragmentation.rs b/feos-core/src/python/parameter/fragmentation.rs
@@ -0,0 +1,217 @@
+use super::{ParameterError, PyChemicalRecord, PyIdentifier};
+use crate::parameter::{ChemicalRecord, Identifier};
+use pyo3::exceptions::PyValueError;
+use pyo3::prelude::*;
+use serde::{Deserialize, Serialize};
+use std::collections::{HashMap, HashSet};
+use std::fs::File;
+use std::io::BufReader;
+use std::path::Path;
+
+#[derive(Clone, Serialize, Deserialize)]
+pub struct SmartsRecord {
+    group: String,
+    smarts: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    max: Option<usize>,
+}
+
+impl SmartsRecord {
+    fn new(group: String, smarts: String, max: Option<usize>) -> Self {
+        Self { group, smarts, max }
+    }
+
+    /// Read a list of `SmartsRecord`s from a JSON file.
+    pub fn from_json<P: AsRef<Path>>(file: P) -> Result<Vec<Self>, ParameterError> {
+        Ok(serde_json::from_reader(BufReader::new(File::open(file)?))?)
+    }
+}
+
+impl std::fmt::Display for SmartsRecord {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "SmartsRecord(group={}, smarts={}",
+            self.group, self.smarts
+        )?;
+        if let Some(max) = self.max {
+            write!(f, ", max={}", max)?;
+        }
+        write!(f, ")")
+    }
+}
+
+#[pyclass(name = "SmartsRecord")]
+#[derive(Clone)]
+#[pyo3(text_signature = "(group, smarts, max=None)")]
+pub struct PySmartsRecord(pub SmartsRecord);
+
+#[pymethods]
+impl PySmartsRecord {
+    #[new]
+    fn new(group: String, smarts: String, max: Option<usize>) -> Self {
+        Self(SmartsRecord::new(group, smarts, max))
+    }
+
+    fn __repr__(&self) -> PyResult<String> {
+        Ok(self.0.to_string())
+    }
+
+    /// Read a list of `SmartsRecord`s from a JSON file.
+    ///
+    /// Parameters
+    /// ----------
+    /// path : str
+    ///     Path to file containing the SMARTS records.
+    ///
+    /// Returns
+    /// -------
+    /// [SmartsRecord]
+    #[staticmethod]
+    #[pyo3(text_signature = "(path)")]
+    pub fn from_json(path: &str) -> Result<Vec<Self>, ParameterError> {
+        Ok(SmartsRecord::from_json(path)?
+            .into_iter()
+            .map(Self)
+            .collect())
+    }
+}
+
+// This macro call is the only reason why SmartsRecord is not implemented as one
+// single Python class.
+impl_json_handling!(PySmartsRecord);
+
+#[pymethods]
+impl PyChemicalRecord {
+    #[staticmethod]
+    pub fn from_smiles(
+        py: Python<'_>,
+        identifier: &PyAny,
+        smarts: Vec<PySmartsRecord>,
+    ) -> PyResult<Self> {
+        let identifier = if let Ok(smiles) = identifier.extract::<String>() {
+            Identifier::new(None, None, None, Some(&smiles), None, None)
+        } else if let Ok(identifier) = identifier.extract::<PyIdentifier>() {
+            identifier.0
+        } else {
+            return Err(PyErr::new::<PyValueError, _>(
+                "`identifier` must be a SMILES code or `Identifier` object.".to_string(),
+            ));
+        };
+        let smiles = identifier
+            .smiles
+            .as_ref()
+            .expect("Missing SMILES in `Identifier`");
+        let (segments, bonds) = fragment_molecule(py, smiles, smarts)?;
+        let segments = segments.into_iter().map(|s| s.to_owned()).collect();
+        Ok(Self(ChemicalRecord::new(identifier, segments, Some(bonds))))
+    }
+}
+
+fn fragment_molecule(
+    py: Python<'_>,
+    smiles: &str,
+    smarts: Vec<PySmartsRecord>,
+) -> PyResult<(Vec<String>, Vec<[usize; 2]>)> {
+    let chem = py.import("rdkit.Chem")?;
+    let mol = chem.call_method1("MolFromSmiles", (smiles,))?;
+    let atoms = mol.call_method0("GetNumHeavyAtoms")?.extract::<usize>()?;
+
+    // find the location of all fragment using the given smarts
+    let mut matches: HashMap<_, _> = smarts
+        .into_iter()
+        .map(|s| {
+            let m = chem.call_method1("MolFromSmarts", (s.0.smarts,))?;
+            let matches = mol
+                .call_method1("GetSubstructMatches", (m,))?
+                .extract::<Vec<&PyAny>>()?;
+            let mut matches: Vec<_> = matches
+                .into_iter()
+                .map(|m| m.extract::<Vec<usize>>())
+                .collect::<PyResult<_>>()?;
+            // Instead of just throwing an error at this point, just try to continue with the first max
+            // occurrences. For some cases (the ethers) this just means that the symetry of C-O-C is broken.
+            // If a necessary segment is eliminated the error will be thrown later.
+            if let Some(max) = s.0.max {
+                if matches.len() > max {
+                    matches = matches[..max].to_vec();
+                }
+            }
+            Ok((s.0.group, matches))
+        })
+        .collect::<PyResult<_>>()?;
+
+    // Filter small segments that are covered by larger segments (also only required by the weird
+    // ether groups of Sauer et al.)
+    let large_segments: HashSet<_> = matches
+        .values()
+        .flatten()
+        .filter(|m| m.len() > 1)
+        .flatten()
+        .copied()
+        .collect();
+    matches
+        .iter_mut()
+        .for_each(|(_, m)| m.retain(|m| !(m.len() == 1 && large_segments.contains(&m[0]))));
+
+    let bonds = mol.call_method0("GetBonds")?;
+    let builtins = py.import("builtins")?;
+    let bonds = builtins
+        .call_method1("list", (bonds,))?
+        .extract::<Vec<&PyAny>>()?;
+    let bonds: Vec<_> = bonds
+        .into_iter()
+        .map(|b| {
+            Ok([
+                b.call_method0("GetBeginAtomIdx")?.extract::<usize>()?,
+                b.call_method0("GetEndAtomIdx")?.extract::<usize>()?,
+            ])
+        })
+        .collect::<PyResult<_>>()?;
+
+    convert_matches(atoms, matches, bonds)
+}
+
+fn convert_matches(
+    atoms: usize,
+    matches: HashMap<String, Vec<Vec<usize>>>,
+    bonds: Vec<[usize; 2]>,
+) -> PyResult<(Vec<String>, Vec<[usize; 2]>)> {
+    // check if every atom is captured by exactly one fragment
+    let identified_atoms: Vec<_> = matches
+        .values()
+        .flat_map(|v| v.iter().flat_map(|l| l.iter()))
+        .collect();
+    let unique_atoms: HashSet<_> = identified_atoms.iter().collect();
+    if unique_atoms.len() == identified_atoms.len() && unique_atoms.len() == atoms {
+        // Translate the atom indices to segment indices (some segments contain more than one atom)
+        let mut segment_indices: Vec<_> = matches
+            .into_iter()
+            .flat_map(|(group, l)| {
+                l.into_iter().map(move |mut k| {
+                    k.sort();
+                    (k, group.clone())
+                })
+            })
+            .collect();
+        segment_indices.sort();
+
+        let segment_map: Vec<_> = segment_indices
+            .iter()
+            .enumerate()
+            .flat_map(|(i, (k, _))| k.iter().map(move |_| i))
+            .collect();
+        let segments: Vec<_> = segment_indices.into_iter().map(|(_, g)| g).collect();
+
+        let bonds: Vec<_> = bonds
+            .into_iter()
+            .map(|[a, b]| [segment_map[a], segment_map[b]])
+            .filter(|[a, b]| a != b)
+            .collect();
+        return Ok((segments, bonds));
+    }
+
+    Err(PyErr::new::<PyValueError, _>(
+        "Molecule cannot be built from groups!",
+    ))
+}
diff --git a/feos-core/src/python/parameter.rs → feos-core/src/python/parameter/mod.rs b/feos-core/src/python/parameter.rs → feos-core/src/python/parameter/mod.rs
@@ -330,7 +330,7 @@ impl PyBinarySegmentRecord {
     /// [BinarySegmentRecord]
     #[staticmethod]
     #[pyo3(text_signature = "(path)")]
-    fn from_json(path: &str) -> Result<Vec<Self>, ParameterError> {
+    pub fn from_json(path: &str) -> Result<Vec<Self>, ParameterError> {
         Ok(BinaryRecord::from_json(path)?
             .into_iter()
             .map(Self)
@@ -751,7 +751,7 @@ macro_rules! impl_parameter_from_segments {
                 chemical_records: Vec<PyChemicalRecord>,
                 segment_records: Vec<PySegmentRecord>,
                 binary_segment_records: Option<Vec<PyBinarySegmentRecord>>,
-            ) -> Result<Self, ParameterError> {
+            ) -> PyResult<Self> {
                 Ok(Self(Arc::new(<$parameter>::from_segments(
                     chemical_records.into_iter().map(|cr| cr.0).collect(),
                     segment_records.into_iter().map(|sr| sr.0).collect(),
@@ -784,7 +784,7 @@ macro_rules! impl_parameter_from_segments {
                 segments_path: String,
                 binary_path: Option<String>,
                 search_option: IdentifierOption,
-            ) -> Result<Self, ParameterError> {
+            ) -> PyResult<Self> {
                 Ok(Self(Arc::new(<$parameter>::from_json_segments(
                     &substances,
                     pure_path,
@@ -793,6 +793,76 @@ macro_rules! impl_parameter_from_segments {
                     search_option,
                 )?)))
             }
+
+            /// Creates parameters from SMILES and segment records.
+            ///
+            /// Requires an installation of rdkit.
+            ///
+            /// Parameters
+            /// ----------
+            /// identifier : [str | Identifier]
+            ///     A list of SMILES codes or [Identifier] objects.
+            /// smarts_records : [SmartsRecord]
+            ///     A list of records containing the SMARTS codes used
+            ///     to fragment the molecule.
+            /// segment_records : [SegmentRecord]
+            ///     A list of records containing the parameters of
+            ///     all individual segments.
+            /// binary_segment_records : [BinarySegmentRecord], optional
+            ///     A list of binary segment-segment parameters.
+            #[staticmethod]
+            #[pyo3(text_signature = "(identifier, smarts_records, segment_records, binary_segment_records=None)")]
+            fn from_smiles(
+                py: Python<'_>,
+                identifier: Vec<&PyAny>,
+                smarts_records: Vec<PySmartsRecord>,
+                segment_records: Vec<PySegmentRecord>,
+                binary_segment_records: Option<Vec<PyBinarySegmentRecord>>,
+            ) -> PyResult<Self> {
+                let chemical_records: Vec<_> = identifier
+                    .into_iter()
+                    .map(|i| PyChemicalRecord::from_smiles(py, i, smarts_records.clone()))
+                    .collect::<PyResult<_>>()?;
+                Self::from_segments(chemical_records, segment_records, binary_segment_records)
+            }
+
+            /// Creates parameters from SMILES using segments from json file.
+            ///
+            /// Requires an installation of rdkit.
+            ///
+            /// Parameters
+            /// ----------
+            /// identifier : [str | Identifier]
+            ///     A list of SMILES codes or [Identifier] objects.
+            /// smarts_path : str
+            ///     Path to file containing SMARTS records.
+            /// segments_path : str
+            ///     Path to file containing segment parameters.
+            /// binary_path : str, optional
+            ///     Path to file containing binary segment-segment parameters.
+            #[staticmethod]
+            #[pyo3(
+                signature = (identifier, smarts_path, segments_path, binary_path=None),
+                text_signature = "(identifier, smarts_path, segments_path, binary_path=None)"
+            )]
+            fn from_json_smiles(
+                py: Python<'_>,
+                identifier: Vec<&PyAny>,
+                smarts_path: String,
+                segments_path: String,
+                binary_path: Option<String>,
+            ) -> PyResult<Self> {
+                let smarts_records = PySmartsRecord::from_json(&smarts_path)?;
+                let segment_records = PySegmentRecord::from_json(&segments_path)?;
+                let binary_segment_records = binary_path.map(|p| PyBinarySegmentRecord::from_json(&p)).transpose()?;
+                Self::from_smiles(
+                    py,
+                    identifier,
+                    smarts_records,
+                    segment_records,
+                    binary_segment_records,
+                )
+            }
         }
     };
 }
@@ -815,3 +885,6 @@ macro_rules! impl_json_handling {
         }
     };
 }
+
+mod fragmentation;
+pub use fragmentation::PySmartsRecord;
diff --git a/src/gc_pcsaft/python/mod.rs b/src/gc_pcsaft/python/mod.rs
@@ -6,7 +6,9 @@ use crate::association::PyAssociationRecord;
 use feos_core::parameter::{
     BinaryRecord, IdentifierOption, ParameterError, ParameterHetero, SegmentRecord,
 };
-use feos_core::python::parameter::{PyBinarySegmentRecord, PyChemicalRecord, PyIdentifier};
+use feos_core::python::parameter::{
+    PyBinarySegmentRecord, PyChemicalRecord, PyIdentifier, PySmartsRecord,
+};
 use feos_core::{impl_json_handling, impl_parameter_from_segments, impl_segment_record};
 #[cfg(feature = "dft")]
 use numpy::{PyArray2, ToPyArray};
@@ -149,6 +151,7 @@ pub fn gc_pcsaft(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
     m.add_class::<PyIdentifier>()?;
     m.add_class::<IdentifierOption>()?;
     m.add_class::<PyChemicalRecord>()?;
+    m.add_class::<PySmartsRecord>()?;
     m.add_class::<PyAssociationRecord>()?;
 
     m.add_class::<PyGcPcSaftRecord>()?;

diff --git a/src/pcsaft/parameters.rs b/src/pcsaft/parameters.rs
@@ -177,7 +177,10 @@ impl FromSegments<usize> for PcSaftRecord {
         let dipole_segments: usize = segments.iter().filter_map(|(s, n)| s.mu.map(|_| n)).sum();
         let assoc_segments: usize = segments
             .iter()
-            .filter_map(|(s, n)| s.association_record.map(|_| n))
+            .filter_map(|(s, n)| {
+                s.association_record
+                    .map(|r| (r.na * r.nb + r.nc) as usize * n)
+            })
             .sum();
         if polar_segments > 1 {
             return Err(ParameterError::IncompatibleParameters(format!(

diff --git a/src/pcsaft/python.rs b/src/pcsaft/python.rs
@@ -211,6 +211,7 @@ pub fn pcsaft(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
     m.add_class::<PyIdentifier>()?;
     m.add_class::<IdentifierOption>()?;
     m.add_class::<PyChemicalRecord>()?;
+    m.add_class::<PySmartsRecord>()?;
 
     m.add_class::<DQVariants>()?;
     m.add_class::<PyPcSaftRecord>()?;