Skip to content

Commit

Permalink
Merge pull request #437 from seoklab/fmt/pdb/no-guess
Browse files Browse the repository at this point in the history
feat(fmt/pdb): don't guess in reader
  • Loading branch information
jnooree authored Dec 18, 2024
2 parents d1586db + 71c2cfc commit 1d4ec7e
Show file tree
Hide file tree
Showing 13 changed files with 96 additions and 159 deletions.
13 changes: 10 additions & 3 deletions include/nuri/algo/guess.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#ifndef NURI_ALGO_GUESS_H_
#define NURI_ALGO_GUESS_H_

#include <absl/base/attributes.h>

#include "nuri/core/molecule.h"

namespace nuri {
Expand All @@ -26,8 +28,9 @@ constexpr double kDefaultThreshold = 0.5;
* If connectivity information is already present and is correct, consider using
* guess_all_types().
*/
extern bool guess_everything(MoleculeMutator &mut, int conf = 0,
double threshold = kDefaultThreshold);
ABSL_MUST_USE_RESULT extern bool
guess_everything(MoleculeMutator &mut, int conf = 0,
double threshold = kDefaultThreshold);

/**
* @brief Guess connectivity information of a molecule.
Expand All @@ -53,7 +56,7 @@ extern void guess_connectivity(MoleculeMutator &mut, int conf = 0,
* and all atom/bond types and implicit hydrogen counts are incorrect. The
* information present in the molecule could be overwritten by this function.
*/
extern bool guess_all_types(Molecule &mol, int conf = 0);
ABSL_MUST_USE_RESULT extern bool guess_all_types(Molecule &mol, int conf = 0);

/**
* @brief Guess formal charges of a molecule.
Expand Down Expand Up @@ -84,6 +87,10 @@ extern void guess_hydrogens_2d(Molecule &mol);
* guess_hydrogens() in sequence.
*/
extern void guess_fcharge_hydrogens_2d(Molecule &mol);

namespace internal {
ABSL_MUST_USE_RESULT extern bool guess_update_subs(Molecule &mol);
} // namespace internal
} // namespace nuri

#endif /* NURI_ALGO_GUESS_H_ */
2 changes: 1 addition & 1 deletion include/nuri/fmt/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class MoleculeReader {
*/
virtual Molecule parse(const std::vector<std::string> &block) const = 0;

virtual bool sanitized() const = 0;
virtual bool bond_valid() const = 0;

MoleculeStream<MoleculeReader> stream() { return { *this }; }
};
Expand Down
2 changes: 1 addition & 1 deletion include/nuri/fmt/mol2.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class Mol2Reader: public DefaultReaderImpl<read_mol2> {

bool getnext(std::vector<std::string> &block) override;

bool sanitized() const override { return false; }
bool bond_valid() const override { return true; }

private:
bool read_mol_header_ = false;
Expand Down
2 changes: 1 addition & 1 deletion include/nuri/fmt/pdb.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class PDBReader: public DefaultReaderImpl<read_pdb> {

bool getnext(std::vector<std::string> &block) override;

bool sanitized() const override { return true; }
bool bond_valid() const override { return false; }

private:
std::vector<std::string> header_;
Expand Down
2 changes: 1 addition & 1 deletion include/nuri/fmt/sdf.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ class SDFReader: public DefaultReaderImpl<read_sdf> {

bool getnext(std::vector<std::string> &block) override;

bool sanitized() const override { return false; }
bool bond_valid() const override { return true; }
};

class SDFReaderFactory: public DefaultReaderFactoryImpl<SDFReader> {
Expand Down
2 changes: 1 addition & 1 deletion include/nuri/fmt/smiles.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class SmilesReader: public DefaultReaderImpl<read_smiles> {

bool getnext(std::vector<std::string> &block) override;

bool sanitized() const override { return false; }
bool bond_valid() const override { return true; }
};

class SmilesReaderFactory: public DefaultReaderFactoryImpl<SmilesReader> {
Expand Down
9 changes: 5 additions & 4 deletions python/docs/conf.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

import doctest
import re

import doctest
from sphinx.ext import autodoc

# -- Project information -----------------------------------------------------
Expand Down Expand Up @@ -61,10 +61,10 @@ html_static_path = ["@CMAKE_CURRENT_LIST_DIR@/_static"]
if "@DOXYGEN_OUTPUT_DIR@":
html_extra_path = ["@DOXYGEN_OUTPUT_DIR@/html"]
html_css_files = [
"css/rtd-property.css" # workaround https://github.com/readthedocs/sphinx_rtd_theme/issues/1301
"css/rtd-property.css" # workaround https://github.com/readthedocs/sphinx_rtd_theme/issues/1301
]
html_theme_options = {
'navigation_depth': -1,
"navigation_depth": -1,
}

intersphinx_mapping = {
Expand All @@ -83,7 +83,8 @@ doctest_global_setup = "import nuri"
class UnneededBaseStripDocumenter(autodoc.ClassDocumenter):
_strip_re = re.compile(
r"\s*Bases:\s*:py:class:"
r"`([^`]*?\bpybind11_[A-Za-z0-9_\.]+|object)`\s*")
r"`([^`]*?\bpybind11_[A-Za-z0-9_\.]+|object)`\s*"
)

def add_line(self, line: str, source: str, *lineno: int):
if self._strip_re.fullmatch(line):
Expand Down
32 changes: 23 additions & 9 deletions python/src/nuri/fmt/fmt_module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <pybind11/pytypes.h>
#include <pybind11/stl/filesystem.h>

#include "nuri/algo/guess.h"
#include "nuri/core/molecule.h"
#include "nuri/fmt/base.h"
#include "nuri/fmt/mol2.h"
Expand All @@ -38,7 +39,8 @@ class PyMoleculeReader {
public:
PyMoleculeReader(std::unique_ptr<std::istream> is, std::string_view fmt,
bool sanitize, bool skip_on_error)
: stream_(std::move(is)), skip_on_error_(skip_on_error) {
: stream_(std::move(is)), sanitize_(sanitize),
skip_on_error_(skip_on_error) {
if (!*stream_)
throw py::value_error(absl::StrCat("Invalid stream object"));

Expand All @@ -51,7 +53,7 @@ class PyMoleculeReader {
if (!reader_)
throw py::value_error(absl::StrCat("Failed to create reader for ", fmt));

sanitize_ = sanitize && !reader_->sanitized();
guess_ = sanitize && !reader_->bond_valid();
}

auto next() {
Expand All @@ -78,7 +80,16 @@ class PyMoleculeReader {
continue;
}

if (sanitize_ && !MoleculeSanitizer(mol).sanitize_all()) {
if (guess_ && mol.is_3d()) {
if (!internal::guess_update_subs(mol)) {
log_or_throw("Failed to guess molecule atom/bond types");
continue;
}
} else if (sanitize_ && !MoleculeSanitizer(mol).sanitize_all()) {
ABSL_LOG_IF(WARNING, guess_ && !mol.is_3d())
<< "Reader might produce molecules with invalid bonds, but the "
"molecule is missing 3D coordinates; guessing is disabled.";

log_or_throw("Failed to sanitize molecule");
continue;
}
Expand All @@ -102,6 +113,7 @@ class PyMoleculeReader {
std::vector<std::string> block_;
bool sanitize_;
bool skip_on_error_;
bool guess_;
};

template <class F, class... Args>
Expand Down Expand Up @@ -146,9 +158,10 @@ Read a molecule from a file.
:param fmt: The format of the file.
:param path: The path to the file.
:param sanitize: Whether to sanitize the produced molecule. Note that if the
underlying reader produces a sanitized molecule, this option is ignored and
the molecule is always sanitized.
:param sanitize: Whether to sanitize the produced molecule. For formats that is
known to produce molecules with insufficient bond information (e.g. PDB), this
option will trigger guessing based on the 3D coordinates
(:func:`nuri.algo.guess_everything()`).
:param skip_on_error: Whether to skip a molecule if an error occurs, instead of
raising an exception.
:raises OSError: If any file-related error occurs.
Expand All @@ -171,9 +184,10 @@ Read a molecule from string.
:param fmt: The format of the file.
:param data: The string to read.
:param sanitize: Whether to sanitize the produced molecule. Note that if the
underlying reader produces a sanitized molecule, this option is ignored and
the molecule is always sanitized.
:param sanitize: Whether to sanitize the produced molecule. For formats that is
known to produce molecules with insufficient bond information (e.g. PDB), this
option will trigger guessing based on the 3D coordinates
(:func:`nuri.algo.guess_everything()`).
:param skip_on_error: Whether to skip a molecule if an error occurs, instead of
raising an exception.
:raises ValueError: If the format is unknown or sanitization fails, unless
Expand Down
15 changes: 15 additions & 0 deletions src/algo/guess/3d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2170,4 +2170,19 @@ bool guess_all_types(Molecule &mol, int conf) {
reset_bonds(mol);
return guess_types_common(mol, mol.confs()[conf]);
}

namespace internal {
bool guess_update_subs(Molecule &mol) {
bool ret;
{
auto mut = mol.mutator();
ret = guess_everything(mut);
}

for (auto &sub: mol.substructures())
sub.refresh_bonds();

return ret;
}
} // namespace internal
} // namespace nuri
Loading

0 comments on commit 1d4ec7e

Please sign in to comment.