Skip to content

Commit

Permalink
feat(fmt/mmcif): implement MMCif parser (#444)
Browse files Browse the repository at this point in the history
* perf(fmt/pdb): reserve one more for name

* feat(fmt/cif): allow default-construction of CifValue

* feat(fmt/mmcif): implement MMCif parser

* feat(fmt/mmcif): improve conversion failure log message

* feat(fmt/mmcif): test for null value before conversion

* feat(fmt/mmcif): set molecule name

* feat(fmt/mmcif): parse _struct_conn table

* test(fmt/mmcif): add basic parsing test

* fix(fmt/mmcif): fix alternate location handling

* feat(fmt/mmcif): improve log messages

* test(fmt/mmcif): add alternate location test

* fix(fmt/mmcif): return models in order of appearance

* test(fmt/mmcif): test for multiple models and altlocs

* refactor(fmt/pdb,mmcif): merge shared logic

* test(fmt/pdb,mmcif): add testcase for insertion code

* fix(fmt): cleanup headers
  • Loading branch information
jnooree authored Jan 3, 2025
2 parents 41b9985 + 80742b0 commit a577eec
Show file tree
Hide file tree
Showing 10 changed files with 4,600 additions and 145 deletions.
2 changes: 2 additions & 0 deletions include/nuri/fmt/cif.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ namespace internal {
kNull = kUnknown | kInapplicable, // any null value
};

CifValue(): type_(Type::kInapplicable) { }

CifValue(std::string_view value, internal::CifToken type): value_(value) {
if (type == internal::CifToken::kQuotedValue) {
type_ = Type::kString;
Expand Down
20 changes: 20 additions & 0 deletions include/nuri/fmt/mmcif.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
//
// Project NuriKit - Copyright 2025 SNU Compbio Lab.
// SPDX-License-Identifier: Apache-2.0
//

#ifndef NURI_FMT_MMCIF_H_
#define NURI_FMT_MMCIF_H_

/// @cond
#include <vector>
/// @endcond

#include "nuri/core/molecule.h"
#include "nuri/fmt/cif.h"

namespace nuri {
std::vector<Molecule> mmcif_read_next_block(CifParser &parser);
}

#endif /* NURI_FMT_MMCIF_H_ */
139 changes: 139 additions & 0 deletions src/fmt/fmt_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,150 @@
#ifndef NURI_FMT_FMT_INTERNAL_H_
#define NURI_FMT_FMT_INTERNAL_H_

#include <functional>
#include <iterator>
#include <string>
#include <string_view>
#include <type_traits>
#include <utility>
#include <vector>

#include <absl/log/absl_check.h>
#include <boost/spirit/home/x3.hpp>

#include "nuri/eigen_config.h"
#include "nuri/core/molecule.h"
#include "nuri/utils.h"

namespace nuri {
namespace internal {
template <auto AltlocExtractor, class AT, class CoordConverter>
void pdb_update_confs(Molecule &mol, const std::vector<AT> &atom_data,
const CoordConverter &coords) {
std::vector<std::string_view> s, t;
std::vector<std::string_view> *altlocs = &s, *buf = &t;

for (const AT &ad: atom_data) {
if (ad.data().size() <= 1)
continue;

buf->clear();
std::set_union(altlocs->cbegin(), altlocs->cend(),
make_transform_iterator<AltlocExtractor>(ad.data().begin()),
make_transform_iterator<AltlocExtractor>(ad.data().end()),
std::back_inserter(*buf));
std::swap(altlocs, buf);
}

if (altlocs->empty()) {
Matrix3Xd &conf = mol.confs().emplace_back(Matrix3Xd(3, atom_data.size()));
for (int i = 0; i < atom_data.size(); ++i)
coords(atom_data[i].first(), conf.col(i));
return;
}

ABSL_DCHECK(altlocs->size() > 1);

mol.confs().resize(altlocs->size());
for (int i = 0; i < altlocs->size(); ++i)
mol.confs()[i].resize(3, static_cast<int>(atom_data.size()));

for (int i = 0; i < atom_data.size(); ++i) {
const AT &ad = atom_data[i];

if (ad.data().size() == 1) {
Eigen::Ref<Vector3d> coord = mol.confs()[0].col(i);
coords(ad.first(), coord);
for (int j = 1; j < mol.confs().size(); ++j)
mol.confs()[j].col(i) = coord;
continue;
}

if (ad.data().size() == altlocs->size()) {
for (int j = 0; j < mol.confs().size(); ++j)
coords(ad.data()[j], mol.confs()[j].col(i));
continue;
}

Vector3d major_coord;
coords(ad.data()[ad.major()], major_coord);

int j = 0;
for (int k = 0; k < ad.data().size(); ++j) {
ABSL_DCHECK(j < mol.confs().size());

std::string_view altloc = (*altlocs)[j];
Eigen::Ref<Vector3d> coord = mol.confs()[j].col(i);

if (std::invoke(AltlocExtractor, ad.data()[k]) != altloc) {
coord = major_coord;
continue;
}

coords(ad.data()[k++], coord);
}

for (; j < mol.confs().size(); ++j)
mol.confs()[j].col(i) = major_coord;
}
}

constexpr int kChainIdx = 0;

template <class RT, class AT, class ChainAsSv, class ResidueMember,
class SeqMember, class ICodeAsSv,
std::enable_if_t<!std::is_reference_v<RT>, int> = 0>
// NOLINTNEXTLINE(*-missing-std-forward)
void pdb_update_substructs(Molecule &mol, RT &&residues,
const std::vector<AT> &atoms,
const ChainAsSv &chain_sv,
const ResidueMember &residue_member,
const SeqMember &seq_member,
const ICodeAsSv &icode_sv) {
auto &subs = mol.substructures();

std::vector<std::pair<std::string_view, std::vector<int>>> chains;

for (int i = 0; i < atoms.size(); ++i) {
const auto &id = atoms[i].first().id().res;

auto [cit, _] = insert_sorted(chains, { std::invoke(chain_sv, id), {} },
[](const auto &a, const auto &b) {
return a.first < b.first;
});
cit->second.push_back(i);
}

subs.reserve(residues.size() + chains.size());
subs.resize(residues.size(), mol.substructure(SubstructCategory::kResidue));
subs.resize(residues.size() + chains.size(),
mol.substructure(SubstructCategory::kChain));

auto sit = subs.begin();
for (int i = 0; i < residues.size(); ++i, ++sit) {
auto &data = residues[i];

sit->update(std::move(data.idxs), {});
sit->name() = std::invoke(residue_member, data);
sit->set_id(std::invoke(seq_member, data.id));
sit->add_prop("chain", std::invoke(chain_sv, data.id));

std::string_view icode = std::invoke(icode_sv, data.id);
if (!icode.empty())
sit->add_prop("icode", icode);

ABSL_DCHECK(sit->props()[kChainIdx].first == "chain");
}

for (int i = 0; i < chains.size(); ++i, ++sit) {
auto &chain = chains[i];
sit->update(std::move(chain.second), {});
sit->name() = chain.first;
sit->set_id(i);
}
}
} // namespace internal

// NOLINTNEXTLINE(google-build-namespaces)
namespace {
// NOLINTBEGIN(readability-identifier-naming,*-unused-const-variable)
Expand Down
Loading

0 comments on commit a577eec

Please sign in to comment.