diff --git a/include/nuri/algo/guess.h b/include/nuri/algo/guess.h index 7b084b77..d9955fd5 100644 --- a/include/nuri/algo/guess.h +++ b/include/nuri/algo/guess.h @@ -6,6 +6,8 @@ #ifndef NURI_ALGO_GUESS_H_ #define NURI_ALGO_GUESS_H_ +#include + #include "nuri/core/molecule.h" namespace nuri { @@ -26,8 +28,9 @@ constexpr double kDefaultThreshold = 0.5; * If connectivity information is already present and is correct, consider using * guess_all_types(). */ -extern bool guess_everything(MoleculeMutator &mut, int conf = 0, - double threshold = kDefaultThreshold); +ABSL_MUST_USE_RESULT extern bool +guess_everything(MoleculeMutator &mut, int conf = 0, + double threshold = kDefaultThreshold); /** * @brief Guess connectivity information of a molecule. @@ -53,7 +56,7 @@ extern void guess_connectivity(MoleculeMutator &mut, int conf = 0, * and all atom/bond types and implicit hydrogen counts are incorrect. The * information present in the molecule could be overwritten by this function. */ -extern bool guess_all_types(Molecule &mol, int conf = 0); +ABSL_MUST_USE_RESULT extern bool guess_all_types(Molecule &mol, int conf = 0); /** * @brief Guess formal charges of a molecule. @@ -84,6 +87,10 @@ extern void guess_hydrogens_2d(Molecule &mol); * guess_hydrogens() in sequence. */ extern void guess_fcharge_hydrogens_2d(Molecule &mol); + +namespace internal { + ABSL_MUST_USE_RESULT extern bool guess_update_subs(Molecule &mol); +} // namespace internal } // namespace nuri #endif /* NURI_ALGO_GUESS_H_ */ diff --git a/include/nuri/fmt/base.h b/include/nuri/fmt/base.h index 484ff581..23049fbb 100644 --- a/include/nuri/fmt/base.h +++ b/include/nuri/fmt/base.h @@ -129,7 +129,7 @@ class MoleculeReader { */ virtual Molecule parse(const std::vector &block) const = 0; - virtual bool sanitized() const = 0; + virtual bool bond_valid() const = 0; MoleculeStream stream() { return { *this }; } }; diff --git a/include/nuri/fmt/mol2.h b/include/nuri/fmt/mol2.h index da1602da..3f0b623e 100644 --- a/include/nuri/fmt/mol2.h +++ b/include/nuri/fmt/mol2.h @@ -31,7 +31,7 @@ class Mol2Reader: public DefaultReaderImpl { bool getnext(std::vector &block) override; - bool sanitized() const override { return false; } + bool bond_valid() const override { return true; } private: bool read_mol_header_ = false; diff --git a/include/nuri/fmt/pdb.h b/include/nuri/fmt/pdb.h index d18a29c4..1c0cdfbb 100644 --- a/include/nuri/fmt/pdb.h +++ b/include/nuri/fmt/pdb.h @@ -31,7 +31,7 @@ class PDBReader: public DefaultReaderImpl { bool getnext(std::vector &block) override; - bool sanitized() const override { return true; } + bool bond_valid() const override { return false; } private: std::vector header_; diff --git a/include/nuri/fmt/sdf.h b/include/nuri/fmt/sdf.h index 4c06dd0d..84b0546e 100644 --- a/include/nuri/fmt/sdf.h +++ b/include/nuri/fmt/sdf.h @@ -31,7 +31,7 @@ class SDFReader: public DefaultReaderImpl { bool getnext(std::vector &block) override; - bool sanitized() const override { return false; } + bool bond_valid() const override { return true; } }; class SDFReaderFactory: public DefaultReaderFactoryImpl { diff --git a/include/nuri/fmt/smiles.h b/include/nuri/fmt/smiles.h index 8fd4cc7e..a21826eb 100644 --- a/include/nuri/fmt/smiles.h +++ b/include/nuri/fmt/smiles.h @@ -33,7 +33,7 @@ class SmilesReader: public DefaultReaderImpl { bool getnext(std::vector &block) override; - bool sanitized() const override { return false; } + bool bond_valid() const override { return true; } }; class SmilesReaderFactory: public DefaultReaderFactoryImpl { diff --git a/python/docs/conf.py.in b/python/docs/conf.py.in index cdd0df8f..9a2d35df 100644 --- a/python/docs/conf.py.in +++ b/python/docs/conf.py.in @@ -8,9 +8,9 @@ # For the full list of built-in configuration values, see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html +import doctest import re -import doctest from sphinx.ext import autodoc # -- Project information ----------------------------------------------------- @@ -61,10 +61,10 @@ html_static_path = ["@CMAKE_CURRENT_LIST_DIR@/_static"] if "@DOXYGEN_OUTPUT_DIR@": html_extra_path = ["@DOXYGEN_OUTPUT_DIR@/html"] html_css_files = [ - "css/rtd-property.css" # workaround https://github.com/readthedocs/sphinx_rtd_theme/issues/1301 + "css/rtd-property.css" # workaround https://github.com/readthedocs/sphinx_rtd_theme/issues/1301 ] html_theme_options = { - 'navigation_depth': -1, + "navigation_depth": -1, } intersphinx_mapping = { @@ -83,7 +83,8 @@ doctest_global_setup = "import nuri" class UnneededBaseStripDocumenter(autodoc.ClassDocumenter): _strip_re = re.compile( r"\s*Bases:\s*:py:class:" - r"`([^`]*?\bpybind11_[A-Za-z0-9_\.]+|object)`\s*") + r"`([^`]*?\bpybind11_[A-Za-z0-9_\.]+|object)`\s*" + ) def add_line(self, line: str, source: str, *lineno: int): if self._strip_re.fullmatch(line): diff --git a/python/src/nuri/fmt/fmt_module.cpp b/python/src/nuri/fmt/fmt_module.cpp index 5d07b2c3..27463e7d 100644 --- a/python/src/nuri/fmt/fmt_module.cpp +++ b/python/src/nuri/fmt/fmt_module.cpp @@ -22,6 +22,7 @@ #include #include +#include "nuri/algo/guess.h" #include "nuri/core/molecule.h" #include "nuri/fmt/base.h" #include "nuri/fmt/mol2.h" @@ -38,7 +39,8 @@ class PyMoleculeReader { public: PyMoleculeReader(std::unique_ptr is, std::string_view fmt, bool sanitize, bool skip_on_error) - : stream_(std::move(is)), skip_on_error_(skip_on_error) { + : stream_(std::move(is)), sanitize_(sanitize), + skip_on_error_(skip_on_error) { if (!*stream_) throw py::value_error(absl::StrCat("Invalid stream object")); @@ -51,7 +53,7 @@ class PyMoleculeReader { if (!reader_) throw py::value_error(absl::StrCat("Failed to create reader for ", fmt)); - sanitize_ = sanitize && !reader_->sanitized(); + guess_ = sanitize && !reader_->bond_valid(); } auto next() { @@ -78,7 +80,16 @@ class PyMoleculeReader { continue; } - if (sanitize_ && !MoleculeSanitizer(mol).sanitize_all()) { + if (guess_ && mol.is_3d()) { + if (!internal::guess_update_subs(mol)) { + log_or_throw("Failed to guess molecule atom/bond types"); + continue; + } + } else if (sanitize_ && !MoleculeSanitizer(mol).sanitize_all()) { + ABSL_LOG_IF(WARNING, guess_ && !mol.is_3d()) + << "Reader might produce molecules with invalid bonds, but the " + "molecule is missing 3D coordinates; guessing is disabled."; + log_or_throw("Failed to sanitize molecule"); continue; } @@ -102,6 +113,7 @@ class PyMoleculeReader { std::vector block_; bool sanitize_; bool skip_on_error_; + bool guess_; }; template @@ -146,9 +158,10 @@ Read a molecule from a file. :param fmt: The format of the file. :param path: The path to the file. -:param sanitize: Whether to sanitize the produced molecule. Note that if the - underlying reader produces a sanitized molecule, this option is ignored and - the molecule is always sanitized. +:param sanitize: Whether to sanitize the produced molecule. For formats that is + known to produce molecules with insufficient bond information (e.g. PDB), this + option will trigger guessing based on the 3D coordinates + (:func:`nuri.algo.guess_everything()`). :param skip_on_error: Whether to skip a molecule if an error occurs, instead of raising an exception. :raises OSError: If any file-related error occurs. @@ -171,9 +184,10 @@ Read a molecule from string. :param fmt: The format of the file. :param data: The string to read. -:param sanitize: Whether to sanitize the produced molecule. Note that if the - underlying reader produces a sanitized molecule, this option is ignored and - the molecule is always sanitized. +:param sanitize: Whether to sanitize the produced molecule. For formats that is + known to produce molecules with insufficient bond information (e.g. PDB), this + option will trigger guessing based on the 3D coordinates + (:func:`nuri.algo.guess_everything()`). :param skip_on_error: Whether to skip a molecule if an error occurs, instead of raising an exception. :raises ValueError: If the format is unknown or sanitization fails, unless diff --git a/src/algo/guess/3d.cpp b/src/algo/guess/3d.cpp index d3922134..51d8f7c8 100644 --- a/src/algo/guess/3d.cpp +++ b/src/algo/guess/3d.cpp @@ -2170,4 +2170,19 @@ bool guess_all_types(Molecule &mol, int conf) { reset_bonds(mol); return guess_types_common(mol, mol.confs()[conf]); } + +namespace internal { + bool guess_update_subs(Molecule &mol) { + bool ret; + { + auto mut = mol.mutator(); + ret = guess_everything(mut); + } + + for (auto &sub: mol.substructures()) + sub.refresh_bonds(); + + return ret; + } +} // namespace internal } // namespace nuri diff --git a/src/fmt/pdb.cpp b/src/fmt/pdb.cpp index b542791b..fc29455d 100644 --- a/src/fmt/pdb.cpp +++ b/src/fmt/pdb.cpp @@ -19,9 +19,7 @@ #include #include -#include #include -#include #include #include #include @@ -34,7 +32,6 @@ #include #include "nuri/eigen_config.h" -#include "nuri/algo/guess.h" #include "nuri/core/element.h" #include "nuri/core/graph.h" #include "nuri/core/molecule.h" @@ -487,9 +484,9 @@ std::string read_model_line(std::string_view line) { line.remove_prefix(5); return std::string(absl::StripAsciiWhitespace(line)); } -} // namespace -namespace { +// TODO(jnooree): move to separate file and provide a more general interface +/* struct PDBAtomInfoTemplate { std::string_view name; int atomic_number; @@ -1563,6 +1560,7 @@ const absl::flat_hash_map kAAData { } }, }; // clang-format on +*/ std::pair parse_serial(std::string_view line) { int serial; @@ -1913,11 +1911,6 @@ void read_connect_line(std::string_view line, const int src, continue; } - if (mut.mol().atom(*dst).data().element().type() == Element::Type::kMetal) { - ABSL_LOG(INFO) << "ignoring CONECT record to metal atom " << serial; - continue; - } - auto [_, added] = mut.add_bond(src, *dst, BondData(constants::kSingleBond)); if (added) ABSL_VLOG(1) << "Added bond " << src << " -> " << *dst << " from CONECT"; @@ -1954,11 +1947,6 @@ void read_connect_section(Iterator &it, const Iterator end, continue; } - if (mut.mol().atom(*idx).data().element().type() == Element::Type::kMetal) { - ABSL_LOG(INFO) << "ignoring CONECT record to metal atom " << serial; - continue; - } - read_connect_line(line, *idx, mut, serial_to_idx); } } @@ -2065,9 +2053,10 @@ void update_confs(Molecule &mol, const std::vector &atom_data) { constexpr int kChainIdx = 0; -void update_substructures(Molecule &mol, std::vector &subs, - PDBResidueData &residue_data, +void update_substructures(Molecule &mol, PDBResidueData &residue_data, const std::vector &atom_data) { + auto &subs = mol.substructures(); + std::vector>> chains; for (int i = 0; i < atom_data.size(); ++i) { @@ -2088,14 +2077,14 @@ void update_substructures(Molecule &mol, std::vector &subs, auto sit = subs.begin(); for (int i = 0; i < residue_data.size(); ++i, ++sit) { - auto &[id, resname, idxs] = residue_data[i]; + auto &data = residue_data[i]; - sit->update(std::move(idxs), {}); - sit->name() = std::string(resname); - sit->set_id(id.seqnum); - sit->add_prop("chain", std::string(1, id.chain)); - if (id.icode != ' ') - sit->add_prop("icode", std::string(1, id.icode)); + sit->update(std::move(data.idxs), {}); + sit->name() = std::string(data.resname); + sit->set_id(data.id.seqnum); + sit->add_prop("chain", std::string(1, data.id.chain)); + if (data.id.icode != ' ') + sit->add_prop("icode", std::string(1, data.id.icode)); ABSL_DCHECK(sit->props()[kChainIdx].first == "chain"); } @@ -2107,81 +2096,6 @@ void update_substructures(Molecule &mol, std::vector &subs, sit->set_id(i); } } - -void update_std_atoms(Molecule &mol, const std::vector &subs) { - for (const Substructure &sub: subs) { - if (sub.category() != SubstructCategory::kResidue) - continue; - - std::string_view resname = sub.name(); - auto ait = kAAData.find(resname); - if (ait == kAAData.end()) - continue; - - const AminoAcid &aa = ait->second; - aa.update_atoms(mol, sub.atom_ids()); - } -} - -// Typically 1.32 Angstroms -constexpr double kMaxPepBondLSq = 2.0 * 2.0; - -void add_inter_res_bond(MoleculeMutator &mut, const int prev_cterm, - const int nterm) { - double lsq = mut.mol().distsq(prev_cterm, nterm); - if (lsq > kMaxPepBondLSq) { - ABSL_VLOG(1) << "Peptide bond between atoms " << prev_cterm << " and " - << nterm << " is too long (" << std::sqrt(lsq) - << " Angstroms), missing residues?"; - return; - } - - auto [bit, added] = mut.add_bond(prev_cterm, nterm, {}); - if (!added) { - ABSL_LOG(INFO) << "Duplicate peptide bond between atoms " << prev_cterm - << " and " << nterm << "; ignoring"; - return; - } - - BondData &bd = bit->data(); - bd.set_conjugated(true); - - AtomData &ad = mut.mol().atom(nterm).data(); - ad.set_hybridization(constants::kSP2); - ad.set_conjugated(true); - ad.set_implicit_hydrogens(nonnegative(ad.implicit_hydrogens() - 1)); -} - -void add_std_bonds(MoleculeMutator &mut, - const std::vector &subs) { - char prev_chain = '\0'; - int prev_cterm = -1; - - for (const Substructure &sub: subs) { - if (sub.category() != SubstructCategory::kResidue) - continue; - - std::string_view resname = sub.name(); - auto ait = kAAData.find(resname); - if (ait == kAAData.end()) { - prev_cterm = -1; - continue; - } - - const AminoAcid &aa = ait->second; - auto [nterm, cterm] = aa.add_bonds(mut, sub.atom_ids()); - const char chain = sub.props()[kChainIdx].second[0]; - ABSL_DCHECK(sub.props()[kChainIdx].first == "chain"); - - if (prev_chain == chain && prev_cterm >= 0 && nterm >= 0) { - ABSL_DCHECK(prev_cterm != nterm); - add_inter_res_bond(mut, prev_cterm, nterm); - } - - prev_chain = chain; - prev_cterm = cterm; - } -} } // namespace Molecule read_pdb(const std::vector &pdb) { @@ -2252,37 +2166,19 @@ Molecule read_pdb(const std::vector &pdb) { return mol; } - std::vector subs; - { auto mut = mol.mutator(); for (PDBAtomData &pd: atom_data) mut.add_atom(pd.to_standard()); - update_substructures(mol, subs, residue_data, atom_data); update_confs(mol, atom_data); - bool guess_ok = guess_everything(mut); - if (!guess_ok) { - ABSL_LOG(WARNING) << "Failed to guess atom/bond types; the resulting " - "molecule might be invalid"; - mut.clear_bonds(); - update_std_atoms(mol, subs); - add_std_bonds(mut, subs); - - int connect_bonds_start = mol.num_bonds(); - read_connect_section(it, end, mut, serial_to_idx); - if (mol.num_bonds() != connect_bonds_start) - remove_hbonds(mut); - - // TODO(jnooree): handle stereochemistry correctly - } + read_connect_section(it, end, mut, serial_to_idx); + if (!mol.bond_empty()) + remove_hbonds(mut); } - for (Substructure &sub: subs) - sub.refresh_bonds(); - - mol.substructures() = std::move(subs); + update_substructures(mol, residue_data, atom_data); return mol; } diff --git a/test/algo/guess_test.cpp b/test/algo/guess_test.cpp index 65079562..8bf23b99 100644 --- a/test/algo/guess_test.cpp +++ b/test/algo/guess_test.cpp @@ -1058,6 +1058,8 @@ TEST(GuessSelectedMolecules, GH358) { int i = 0; for (; i < smiles_answers.size() && reader.getnext(blk); ++i) { Molecule mol = reader.parse(blk); + EXPECT_TRUE(internal::guess_update_subs(mol)); + std::string smi; write_smiles(smi, mol); absl::StripAsciiWhitespace(&smi); @@ -1098,6 +1100,8 @@ TEST(GuessSelectedMolecules, GH367) { int i = 0; for (; i < smiles_answers.size() && reader.getnext(blk); ++i) { Molecule mol = reader.parse(blk); + EXPECT_TRUE(internal::guess_update_subs(mol)); + std::string smi; write_smiles(smi, mol); absl::StripAsciiWhitespace(&smi); diff --git a/test/fmt/base_test.cpp b/test/fmt/base_test.cpp index b85342ea..80880d05 100644 --- a/test/fmt/base_test.cpp +++ b/test/fmt/base_test.cpp @@ -117,7 +117,7 @@ class DummyReader: public MoleculeReader { return {}; } - bool sanitized() const override { return true; } + bool bond_valid() const override { return true; } }; class DummyReaderFactory: public DefaultReaderFactoryImpl { }; diff --git a/test/fmt/pdb_test.cpp b/test/fmt/pdb_test.cpp index 630aa8df..f74d77b3 100644 --- a/test/fmt/pdb_test.cpp +++ b/test/fmt/pdb_test.cpp @@ -16,17 +16,25 @@ #include "fmt_test_common.h" #include "test_utils.h" +#include "nuri/algo/guess.h" #include "nuri/core/molecule.h" #include "nuri/fmt/base.h" namespace nuri { namespace { -using PDBTest = internal::FileFormatTest; +class PDBTest: public internal::FileFormatTest { + using Parent = internal::FileFormatTest; + +public: + bool advance_and_guess() { + return advance() && internal::guess_update_subs(mol()); + } +}; TEST_F(PDBTest, BasicParsing) { set_test_file("dummy.pdb"); - ASSERT_TRUE(advance()); + ASSERT_TRUE(advance_and_guess()); EXPECT_EQ(mol().num_atoms(), 88); EXPECT_EQ(mol().num_bonds(), 87); @@ -57,14 +65,10 @@ TEST_F(PDBTest, HandleInvalidCoordinates) { set_test_file("invalid.pdb"); ASSERT_TRUE(advance()); + EXPECT_NO_FATAL_FAILURE( + static_cast(internal::guess_update_subs(mol()))); EXPECT_EQ(mol().num_atoms(), 90); - EXPECT_EQ(mol().num_bonds(), 89); - - // Peptide + H2O - EXPECT_EQ(mol().num_fragments(), 2); - // Ring from PHE - EXPECT_EQ(mol().ring_groups().size(), 1); // 11 residues + 1 chain EXPECT_EQ(mol().num_substructures(), 12); @@ -79,17 +83,12 @@ TEST_F(PDBTest, HandleInvalidCoordinates) { auto atom = mol().atom(25); EXPECT_EQ(atom.data().get_name(), "N"); EXPECT_EQ(atom.data().atomic_number(), 7); - EXPECT_EQ(atom.data().hybridization(), constants::kSP2); - EXPECT_TRUE(atom.data().is_conjugated()); - - EXPECT_NE(mol().find_bond(87, 88), mol().bond_end()); - EXPECT_NE(mol().find_bond(87, 89), mol().bond_end()); } TEST_F(PDBTest, HandleCleanPDB) { set_test_file("1ubq.pdb"); - ASSERT_TRUE(advance()); + ASSERT_TRUE(advance_and_guess()); EXPECT_EQ(mol().name(), "1UBQ"); EXPECT_EQ(mol().num_atoms(), 660); @@ -114,7 +113,7 @@ TEST_F(PDBTest, HandleCleanPDB) { TEST_F(PDBTest, HandleMultipleModels) { set_test_file("3cye_part.pdb"); - ASSERT_TRUE(advance()); + ASSERT_TRUE(advance_and_guess()); EXPECT_EQ(mol().name(), "3CYE"); EXPECT_EQ(mol().num_atoms(), 55); @@ -134,7 +133,7 @@ TEST_F(PDBTest, HandleMultipleModels) { EXPECT_EQ(mol().get_substructure(0).name(), "VAL"); EXPECT_EQ(mol().get_substructure(0).num_atoms(), 7); - ASSERT_TRUE(advance()); + ASSERT_TRUE(advance_and_guess()); EXPECT_EQ(mol().name(), "3CYE"); EXPECT_EQ(mol().num_atoms(), 36); @@ -204,6 +203,7 @@ class PDB1alxTest: public testing::Test { MoleculeStream ms(reader); ASSERT_TRUE(ms.advance()); + EXPECT_TRUE(internal::guess_update_subs(ms.current())); mol_ = std::move(ms.current()); ASSERT_EQ(mol_.name(), "1ALX");