Skip to content

Commit

Permalink
Create chemical record from SMILES using rdkit (#191)
Browse files Browse the repository at this point in the history
  • Loading branch information
prehner authored Oct 15, 2023
1 parent 5a8b844 commit 0fc8bfa
Show file tree
Hide file tree
Showing 6 changed files with 304 additions and 5 deletions.
2 changes: 2 additions & 0 deletions feos-core/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added optional `Phase` argument for constructors of dynamic properties of `DataSet`s. [#174](https://github.com/feos-org/feos/pull/174)
- Added `molar_weight` method to `Residual` trait. [#177](https://github.com/feos-org/feos/pull/177)
- Added molar versions for entropy, enthalpy, etc. for residual properties. [#177](https://github.com/feos-org/feos/pull/177)
- Python only: Added `SmartsRecord` and `ChemicalRecord.from_smiles` that uses rdkit to calculate segments and bonds from a SMILES code. [#191](https://github.com/feos-org/feos/pull/191)
- Python only: Added `from_smiles` and `from_json_smiles` to parameter objects that build parameters from SMILES codes using rdkit. [#191](https://github.com/feos-org/feos/pull/191)

### Changed
- Changed constructors of `Parameter` trait to return `Result`s. [#161](https://github.com/feos-org/feos/pull/161)
Expand Down
217 changes: 217 additions & 0 deletions feos-core/src/python/parameter/fragmentation.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
use super::{ParameterError, PyChemicalRecord, PyIdentifier};
use crate::parameter::{ChemicalRecord, Identifier};
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::BufReader;
use std::path::Path;

#[derive(Clone, Serialize, Deserialize)]
pub struct SmartsRecord {
group: String,
smarts: String,
#[serde(skip_serializing_if = "Option::is_none")]
max: Option<usize>,
}

impl SmartsRecord {
fn new(group: String, smarts: String, max: Option<usize>) -> Self {
Self { group, smarts, max }
}

/// Read a list of `SmartsRecord`s from a JSON file.
pub fn from_json<P: AsRef<Path>>(file: P) -> Result<Vec<Self>, ParameterError> {
Ok(serde_json::from_reader(BufReader::new(File::open(file)?))?)
}
}

impl std::fmt::Display for SmartsRecord {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"SmartsRecord(group={}, smarts={}",
self.group, self.smarts
)?;
if let Some(max) = self.max {
write!(f, ", max={}", max)?;
}
write!(f, ")")
}
}

#[pyclass(name = "SmartsRecord")]
#[derive(Clone)]
#[pyo3(text_signature = "(group, smarts, max=None)")]
pub struct PySmartsRecord(pub SmartsRecord);

#[pymethods]
impl PySmartsRecord {
#[new]
fn new(group: String, smarts: String, max: Option<usize>) -> Self {
Self(SmartsRecord::new(group, smarts, max))
}

fn __repr__(&self) -> PyResult<String> {
Ok(self.0.to_string())
}

/// Read a list of `SmartsRecord`s from a JSON file.
///
/// Parameters
/// ----------
/// path : str
/// Path to file containing the SMARTS records.
///
/// Returns
/// -------
/// [SmartsRecord]
#[staticmethod]
#[pyo3(text_signature = "(path)")]
pub fn from_json(path: &str) -> Result<Vec<Self>, ParameterError> {
Ok(SmartsRecord::from_json(path)?
.into_iter()
.map(Self)
.collect())
}
}

// This macro call is the only reason why SmartsRecord is not implemented as one
// single Python class.
impl_json_handling!(PySmartsRecord);

#[pymethods]
impl PyChemicalRecord {
#[staticmethod]
pub fn from_smiles(
py: Python<'_>,
identifier: &PyAny,
smarts: Vec<PySmartsRecord>,
) -> PyResult<Self> {
let identifier = if let Ok(smiles) = identifier.extract::<String>() {
Identifier::new(None, None, None, Some(&smiles), None, None)
} else if let Ok(identifier) = identifier.extract::<PyIdentifier>() {
identifier.0
} else {
return Err(PyErr::new::<PyValueError, _>(
"`identifier` must be a SMILES code or `Identifier` object.".to_string(),
));
};
let smiles = identifier
.smiles
.as_ref()
.expect("Missing SMILES in `Identifier`");
let (segments, bonds) = fragment_molecule(py, smiles, smarts)?;
let segments = segments.into_iter().map(|s| s.to_owned()).collect();
Ok(Self(ChemicalRecord::new(identifier, segments, Some(bonds))))
}
}

fn fragment_molecule(
py: Python<'_>,
smiles: &str,
smarts: Vec<PySmartsRecord>,
) -> PyResult<(Vec<String>, Vec<[usize; 2]>)> {
let chem = py.import("rdkit.Chem")?;
let mol = chem.call_method1("MolFromSmiles", (smiles,))?;
let atoms = mol.call_method0("GetNumHeavyAtoms")?.extract::<usize>()?;

// find the location of all fragment using the given smarts
let mut matches: HashMap<_, _> = smarts
.into_iter()
.map(|s| {
let m = chem.call_method1("MolFromSmarts", (s.0.smarts,))?;
let matches = mol
.call_method1("GetSubstructMatches", (m,))?
.extract::<Vec<&PyAny>>()?;
let mut matches: Vec<_> = matches
.into_iter()
.map(|m| m.extract::<Vec<usize>>())
.collect::<PyResult<_>>()?;
// Instead of just throwing an error at this point, just try to continue with the first max
// occurrences. For some cases (the ethers) this just means that the symetry of C-O-C is broken.
// If a necessary segment is eliminated the error will be thrown later.
if let Some(max) = s.0.max {
if matches.len() > max {
matches = matches[..max].to_vec();
}
}
Ok((s.0.group, matches))
})
.collect::<PyResult<_>>()?;

// Filter small segments that are covered by larger segments (also only required by the weird
// ether groups of Sauer et al.)
let large_segments: HashSet<_> = matches
.values()
.flatten()
.filter(|m| m.len() > 1)
.flatten()
.copied()
.collect();
matches
.iter_mut()
.for_each(|(_, m)| m.retain(|m| !(m.len() == 1 && large_segments.contains(&m[0]))));

let bonds = mol.call_method0("GetBonds")?;
let builtins = py.import("builtins")?;
let bonds = builtins
.call_method1("list", (bonds,))?
.extract::<Vec<&PyAny>>()?;
let bonds: Vec<_> = bonds
.into_iter()
.map(|b| {
Ok([
b.call_method0("GetBeginAtomIdx")?.extract::<usize>()?,
b.call_method0("GetEndAtomIdx")?.extract::<usize>()?,
])
})
.collect::<PyResult<_>>()?;

convert_matches(atoms, matches, bonds)
}

fn convert_matches(
atoms: usize,
matches: HashMap<String, Vec<Vec<usize>>>,
bonds: Vec<[usize; 2]>,
) -> PyResult<(Vec<String>, Vec<[usize; 2]>)> {
// check if every atom is captured by exactly one fragment
let identified_atoms: Vec<_> = matches
.values()
.flat_map(|v| v.iter().flat_map(|l| l.iter()))
.collect();
let unique_atoms: HashSet<_> = identified_atoms.iter().collect();
if unique_atoms.len() == identified_atoms.len() && unique_atoms.len() == atoms {
// Translate the atom indices to segment indices (some segments contain more than one atom)
let mut segment_indices: Vec<_> = matches
.into_iter()
.flat_map(|(group, l)| {
l.into_iter().map(move |mut k| {
k.sort();
(k, group.clone())
})
})
.collect();
segment_indices.sort();

let segment_map: Vec<_> = segment_indices
.iter()
.enumerate()
.flat_map(|(i, (k, _))| k.iter().map(move |_| i))
.collect();
let segments: Vec<_> = segment_indices.into_iter().map(|(_, g)| g).collect();

let bonds: Vec<_> = bonds
.into_iter()
.map(|[a, b]| [segment_map[a], segment_map[b]])
.filter(|[a, b]| a != b)
.collect();
return Ok((segments, bonds));
}

Err(PyErr::new::<PyValueError, _>(
"Molecule cannot be built from groups!",
))
}
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ impl PyBinarySegmentRecord {
/// [BinarySegmentRecord]
#[staticmethod]
#[pyo3(text_signature = "(path)")]
fn from_json(path: &str) -> Result<Vec<Self>, ParameterError> {
pub fn from_json(path: &str) -> Result<Vec<Self>, ParameterError> {
Ok(BinaryRecord::from_json(path)?
.into_iter()
.map(Self)
Expand Down Expand Up @@ -751,7 +751,7 @@ macro_rules! impl_parameter_from_segments {
chemical_records: Vec<PyChemicalRecord>,
segment_records: Vec<PySegmentRecord>,
binary_segment_records: Option<Vec<PyBinarySegmentRecord>>,
) -> Result<Self, ParameterError> {
) -> PyResult<Self> {
Ok(Self(Arc::new(<$parameter>::from_segments(
chemical_records.into_iter().map(|cr| cr.0).collect(),
segment_records.into_iter().map(|sr| sr.0).collect(),
Expand Down Expand Up @@ -784,7 +784,7 @@ macro_rules! impl_parameter_from_segments {
segments_path: String,
binary_path: Option<String>,
search_option: IdentifierOption,
) -> Result<Self, ParameterError> {
) -> PyResult<Self> {
Ok(Self(Arc::new(<$parameter>::from_json_segments(
&substances,
pure_path,
Expand All @@ -793,6 +793,76 @@ macro_rules! impl_parameter_from_segments {
search_option,
)?)))
}

/// Creates parameters from SMILES and segment records.
///
/// Requires an installation of rdkit.
///
/// Parameters
/// ----------
/// identifier : [str | Identifier]
/// A list of SMILES codes or [Identifier] objects.
/// smarts_records : [SmartsRecord]
/// A list of records containing the SMARTS codes used
/// to fragment the molecule.
/// segment_records : [SegmentRecord]
/// A list of records containing the parameters of
/// all individual segments.
/// binary_segment_records : [BinarySegmentRecord], optional
/// A list of binary segment-segment parameters.
#[staticmethod]
#[pyo3(text_signature = "(identifier, smarts_records, segment_records, binary_segment_records=None)")]
fn from_smiles(
py: Python<'_>,
identifier: Vec<&PyAny>,
smarts_records: Vec<PySmartsRecord>,
segment_records: Vec<PySegmentRecord>,
binary_segment_records: Option<Vec<PyBinarySegmentRecord>>,
) -> PyResult<Self> {
let chemical_records: Vec<_> = identifier
.into_iter()
.map(|i| PyChemicalRecord::from_smiles(py, i, smarts_records.clone()))
.collect::<PyResult<_>>()?;
Self::from_segments(chemical_records, segment_records, binary_segment_records)
}

/// Creates parameters from SMILES using segments from json file.
///
/// Requires an installation of rdkit.
///
/// Parameters
/// ----------
/// identifier : [str | Identifier]
/// A list of SMILES codes or [Identifier] objects.
/// smarts_path : str
/// Path to file containing SMARTS records.
/// segments_path : str
/// Path to file containing segment parameters.
/// binary_path : str, optional
/// Path to file containing binary segment-segment parameters.
#[staticmethod]
#[pyo3(
signature = (identifier, smarts_path, segments_path, binary_path=None),
text_signature = "(identifier, smarts_path, segments_path, binary_path=None)"
)]
fn from_json_smiles(
py: Python<'_>,
identifier: Vec<&PyAny>,
smarts_path: String,
segments_path: String,
binary_path: Option<String>,
) -> PyResult<Self> {
let smarts_records = PySmartsRecord::from_json(&smarts_path)?;
let segment_records = PySegmentRecord::from_json(&segments_path)?;
let binary_segment_records = binary_path.map(|p| PyBinarySegmentRecord::from_json(&p)).transpose()?;
Self::from_smiles(
py,
identifier,
smarts_records,
segment_records,
binary_segment_records,
)
}
}
};
}
Expand All @@ -815,3 +885,6 @@ macro_rules! impl_json_handling {
}
};
}

mod fragmentation;
pub use fragmentation::PySmartsRecord;
5 changes: 4 additions & 1 deletion src/gc_pcsaft/python/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ use crate::association::PyAssociationRecord;
use feos_core::parameter::{
BinaryRecord, IdentifierOption, ParameterError, ParameterHetero, SegmentRecord,
};
use feos_core::python::parameter::{PyBinarySegmentRecord, PyChemicalRecord, PyIdentifier};
use feos_core::python::parameter::{
PyBinarySegmentRecord, PyChemicalRecord, PyIdentifier, PySmartsRecord,
};
use feos_core::{impl_json_handling, impl_parameter_from_segments, impl_segment_record};
#[cfg(feature = "dft")]
use numpy::{PyArray2, ToPyArray};
Expand Down Expand Up @@ -149,6 +151,7 @@ pub fn gc_pcsaft(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_class::<PyIdentifier>()?;
m.add_class::<IdentifierOption>()?;
m.add_class::<PyChemicalRecord>()?;
m.add_class::<PySmartsRecord>()?;
m.add_class::<PyAssociationRecord>()?;

m.add_class::<PyGcPcSaftRecord>()?;
Expand Down
5 changes: 4 additions & 1 deletion src/pcsaft/parameters.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,10 @@ impl FromSegments<usize> for PcSaftRecord {
let dipole_segments: usize = segments.iter().filter_map(|(s, n)| s.mu.map(|_| n)).sum();
let assoc_segments: usize = segments
.iter()
.filter_map(|(s, n)| s.association_record.map(|_| n))
.filter_map(|(s, n)| {
s.association_record
.map(|r| (r.na * r.nb + r.nc) as usize * n)
})
.sum();
if polar_segments > 1 {
return Err(ParameterError::IncompatibleParameters(format!(
Expand Down
1 change: 1 addition & 0 deletions src/pcsaft/python.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ pub fn pcsaft(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_class::<PyIdentifier>()?;
m.add_class::<IdentifierOption>()?;
m.add_class::<PyChemicalRecord>()?;
m.add_class::<PySmartsRecord>()?;

m.add_class::<DQVariants>()?;
m.add_class::<PyPcSaftRecord>()?;
Expand Down

0 comments on commit 0fc8bfa

Please sign in to comment.