Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Apply subarrays for dense reads and writes #3263

Merged
merged 7 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions apis/python/src/tiledbsoma/_flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,9 @@
NEW_SHAPE_FEATURE_FLAG_ENABLED = os.getenv("SOMA_PY_NEW_SHAPE") != "false"

DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN = clib.embedded_version_triple() >= (2, 27, 0)

# Temporary for https://github.com/single-cell-data/TileDB-SOMA/issues/2407:
# setting the SOMA_IGNORE_CORE_2_27 environment variable (to any value) allows
# testing dense arrays with and without current domain on the same machine,
# without having to switch core builds (or switch machines).
if os.getenv("SOMA_IGNORE_CORE_2_27") is not None:
DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN = False
8 changes: 8 additions & 0 deletions apis/r/R/Init.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,15 @@
.pkgenv[["use_current_domain_transitional_internal_only"]]
}

# Temporary for https://github.com/single-cell-data/TileDB-SOMA/issues/2407.
# Once core 2.27 is released and we depend on it, this can go away.
#
# Returns TRUE when the embedded TileDB core version is 2.27 or newer and the
# SOMA_IGNORE_CORE_2_27 environment variable is unset (or empty); FALSE
# otherwise.
.dense_arrays_can_have_current_domain <- function() {
  # This allows testing dense + current domain on the same machine without
  # having to switch core builds (or switch machines).
  if (Sys.getenv("SOMA_IGNORE_CORE_2_27") != "") {
    return(FALSE)
  }

  triple <- tiledb_embedded_version()
  major <- triple[[1]]
  minor <- triple[[2]]

  # Compare (major, minor) lexicographically. Testing the two components
  # independently (major >= 2 && minor >= 27) would wrongly report FALSE for
  # any later major version whose minor is below 27, e.g. 3.0.
  return(major > 2 || (major == 2 && minor >= 27))
}
Expand Down
158 changes: 129 additions & 29 deletions libtiledbsoma/src/soma/managed_query.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void ManagedQuery::reset() {
query_ = std::make_unique<Query>(*ctx_, *array_);
subarray_ = std::make_unique<Subarray>(*ctx_, *array_);

subarray_range_set_ = false;
subarray_range_set_ = {};
subarray_range_empty_ = {};
columns_.clear();
results_complete_ = true;
Expand Down Expand Up @@ -102,36 +102,10 @@ void ManagedQuery::setup_read() {

auto schema = array_->schema();

_fill_in_subarrays_if_dense(true);

// If the query is uninitialized, set the subarray for the query
if (status == Query::Status::UNINITIALIZED) {
// Dense array must have a subarray set. If the array is dense and no
// ranges have been set, add a range for the array's entire non-empty
// domain on dimension 0. In the case that the non-empty domain does not
// exist (when the array has not been written to yet), use dimension 0's
// full domain
if (schema.array_type() == TILEDB_DENSE && !subarray_range_set_) {
// Check if the array has been written to by using the C API as
// there is no way to to check for an empty domain using the current
// CPP API
int32_t is_empty;
int64_t ned[2];
ctx_->handle_error(tiledb_array_get_non_empty_domain_from_index(
ctx_->ptr().get(), array_->ptr().get(), 0, &ned, &is_empty));

std::pair<int64_t, int64_t> array_shape;
if (is_empty == 1) {
array_shape = schema.domain().dimension(0).domain<int64_t>();
} else {
array_shape = std::make_pair(ned[0], ned[1]);
}

subarray_->add_range(0, array_shape.first, array_shape.second);
LOG_DEBUG(fmt::format(
"[ManagedQuery] Add full range to dense subarray = (0, {}, {})",
array_shape.first,
array_shape.second));
}

// Set the subarray for range slicing
query_->set_subarray(*subarray_);
}
Expand Down Expand Up @@ -162,6 +136,8 @@ void ManagedQuery::setup_read() {
}

void ManagedQuery::submit_write(bool sort_coords) {
_fill_in_subarrays_if_dense(false);

if (array_->schema().array_type() == TILEDB_DENSE) {
query_->set_subarray(*subarray_);
} else {
Expand Down Expand Up @@ -191,6 +167,130 @@ void ManagedQuery::submit_read() {
});
}

// Please see the header-file comments for context.
void ManagedQuery::_fill_in_subarrays_if_dense(bool is_read) {
    LOG_TRACE("[ManagedQuery] _fill_in_subarrays enter");

    // Subarray defaults are applied only while the query is still
    // UNINITIALIZED; once it has left that state (e.g. when fetching a next
    // page) the subarray is left as it is.
    const bool already_started = query_->query_status() !=
                                 Query::Status::UNINITIALIZED;
    if (already_started) {
        LOG_TRACE("[ManagedQuery] _fill_in_subarrays exit: initialized");
        return;
    }

    // Only dense arrays get their subarrays filled in.
    auto array_schema = array_->schema();
    const bool is_dense = array_schema.array_type() == TILEDB_DENSE;
    if (!is_dense) {
        LOG_TRACE("[ManagedQuery] _fill_in_subarrays exit: non-dense");
        return;
    }

    // Dispatch on whether the schema carries a current domain (the "new
    // shape"): with one, every unset dim is filled from it; without one, the
    // older dim-0-only logic applies.
    auto core_current_domain = tiledb::ArraySchemaExperimental::current_domain(
        *ctx_, array_schema);
    if (!core_current_domain.is_empty()) {
        _fill_in_subarrays_if_dense_with_new_shape(core_current_domain);
    } else {
        _fill_in_subarrays_if_dense_without_new_shape(is_read);
    }

    LOG_TRACE("[ManagedQuery] _fill_in_subarrays exit");
}

void ManagedQuery::_fill_in_subarrays_if_dense_without_new_shape(bool is_read) {
    LOG_TRACE(
        "[ManagedQuery] _fill_in_subarrays_if_dense_without_new_shape enter");

    // A dense read must have a subarray set. When the caller has selected no
    // range on any dimension, a range is added on dimension 0 below; when the
    // caller has already selected something, there is nothing to do.
    if (_has_any_subarray_range_set()) {
        return;
    }

    auto array_schema = array_->schema();

    // For writes, leave dimension 0 alone if the subarray already carries a
    // range on it.
    if (!is_read && subarray_->range_num(0) > 0) {
        LOG_TRACE(
            "[ManagedQuery] _fill_in_subarrays_if_dense_without_new_shape "
            "range 0 is set");
        return;
    }

    int64_t dim0_lo = 0;
    int64_t dim0_hi = 0;
    bool use_full_domain = true;

    if (is_read) {
        // Ask the C API for dimension 0's non-empty domain, since the C++ API
        // offers no way to check whether the non-empty domain is empty.
        int32_t ned_is_empty;
        int64_t non_empty_domain[2];
        ctx_->handle_error(tiledb_array_get_non_empty_domain_from_index(
            ctx_->ptr().get(),
            array_->ptr().get(),
            0,
            &non_empty_domain,
            &ned_is_empty));

        if (ned_is_empty != 1) {
            dim0_lo = non_empty_domain[0];
            dim0_hi = non_empty_domain[1];
            use_full_domain = false;
        }
    }

    // Fall back to dimension 0's full domain for writes (the non-empty domain
    // is not available for access at write time) and for reads of an array
    // whose non-empty domain is empty.
    if (use_full_domain) {
        const std::pair<int64_t, int64_t> full_domain =
            array_schema.domain().dimension(0).domain<int64_t>();
        dim0_lo = full_domain.first;
        dim0_hi = full_domain.second;
    }

    subarray_->add_range(0, dim0_lo, dim0_hi);
    LOG_TRACE(fmt::format(
        "[ManagedQuery] Add full range to dense subarray dim0 = ({}, {})",
        dim0_lo,
        dim0_hi));

    // Hand the updated subarray to the query for range slicing.
    query_->set_subarray(*subarray_);
}

/**
 * Fills in subarray ranges for a dense array that has a current domain.
 *
 * For every dimension on which the caller has not already selected a range,
 * selects the range given by the current domain's N-dimensional rectangle.
 *
 * @param current_domain The array schema's current domain.
 * @throws TileDBSOMAError if the current domain is not an NDRectangle, if a
 * dimension to be filled in is not named soma_dim_*, or if it is not of type
 * int64.
 */
void ManagedQuery::_fill_in_subarrays_if_dense_with_new_shape(
    const CurrentDomain& current_domain) {
    LOG_TRACE(
        "[ManagedQuery] _fill_in_subarrays_if_dense_with_new_shape enter");
    if (current_domain.type() != TILEDB_NDRECTANGLE) {
        throw TileDBSOMAError("found non-rectangle current-domain type");
    }
    NDRectangle ndrect = current_domain.ndrectangle();

    // Loop over dims and apply subarray ranges if not already done by the
    // caller.
    auto schema = array_->schema();
    for (const auto& dim : schema.domain().dimensions()) {
        const std::string dim_name = dim.name();

        // Use find() rather than operator[] for this read-only check:
        // operator[] would default-insert a `false` entry into
        // subarray_range_set_ for every dim the caller has not touched.
        const auto range_set_it = subarray_range_set_.find(dim_name);
        if (range_set_it != subarray_range_set_.end() &&
            range_set_it->second) {
            LOG_TRACE(fmt::format(
                "[ManagedQuery] _fill_in_subarrays continue {}", dim_name));
            continue;
        }

        // Dense arrays are (as of this writing in 1.15.0) all DenseNDArray.
        // Per the spec DenseNDArray must only have dims named
        // soma_dim_{i} with i=0,1,2,...,n-1, of type int64.
        // rfind(prefix, 0) == 0 is a starts-with test.
        if (dim_name.rfind("soma_dim_", 0) != 0) {
            throw TileDBSOMAError(fmt::format(
                "found dense array with unexpected dim name {}", dim_name));
        }
        if (dim.type() != TILEDB_INT64) {
            throw TileDBSOMAError(fmt::format(
                "expected dense arrays to have int64 dims; got {} for {}",
                tiledb::impl::to_str(dim.type()),
                dim_name));
        }

        const std::array<int64_t, 2> lo_hi_arr = ndrect.range<int64_t>(
            dim_name);
        const std::pair<int64_t, int64_t> lo_hi_pair(
            lo_hi_arr[0], lo_hi_arr[1]);
        LOG_TRACE(fmt::format(
            "[ManagedQuery] _fill_in_subarrays_if_dense_with_new_shape dim "
            "name {} select ({}, {})",
            dim_name,
            lo_hi_pair.first,
            lo_hi_pair.second));

        // select_ranges also marks this dim as set in subarray_range_set_.
        select_ranges(
            dim_name, std::vector<std::pair<int64_t, int64_t>>{lo_hi_pair});
    }
}

std::shared_ptr<ArrayBuffers> ManagedQuery::results() {
if (is_empty_query()) {
return buffers_;
Expand Down
66 changes: 53 additions & 13 deletions libtiledbsoma/src/soma/managed_query.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ class ManagedQuery {
template <typename T>
void select_ranges(
const std::string& dim, const std::vector<std::pair<T, T>>& ranges) {
subarray_range_set_ = true;
subarray_range_set_[dim] = true;
subarray_range_empty_[dim] = true;
for (auto& [start, stop] : ranges) {
subarray_->add_range(dim, start, stop);
Expand All @@ -168,7 +168,7 @@ class ManagedQuery {
*/
template <typename T>
void select_points(const std::string& dim, const std::vector<T>& points) {
subarray_range_set_ = true;
subarray_range_set_[dim] = true;
subarray_range_empty_[dim] = true;
for (auto& point : points) {
subarray_->add_range(dim, point, point);
Expand All @@ -185,7 +185,7 @@ class ManagedQuery {
*/
template <typename T>
void select_points(const std::string& dim, const tcb::span<T> points) {
subarray_range_set_ = true;
subarray_range_set_[dim] = true;
subarray_range_empty_[dim] = true;
for (auto& point : points) {
subarray_->add_range(dim, point, point);
Expand All @@ -203,7 +203,7 @@ class ManagedQuery {
template <typename T>
void select_point(const std::string& dim, const T& point) {
subarray_->add_range(dim, point, point);
subarray_range_set_ = true;
subarray_range_set_[dim] = true;
subarray_range_empty_[dim] = false;
}

Expand Down Expand Up @@ -383,14 +383,7 @@ class ManagedQuery {
* @return true if the query contains only empty ranges.
*/
bool is_empty_query() {
bool has_empty = false;
for (auto subdim : subarray_range_empty_) {
if (subdim.second == true) {
has_empty = true;
break;
}
}
return subarray_range_set_ && has_empty;
return _has_any_empty_range() && _has_any_subarray_range_set();
}

/**
Expand All @@ -414,6 +407,53 @@ class ManagedQuery {
*/
void check_column_name(const std::string& name);

// Helper for is_empty_query: returns true if at least one dimension is
// flagged as empty in subarray_range_empty_.
bool _has_any_empty_range() {
    // Iterate by const reference: iterating by value copies every
    // (dimension name, flag) pair, including its std::string key.
    for (const auto& entry : subarray_range_empty_) {
        if (entry.second) {
            return true;
        }
    }
    return false;
}

// Helper for is_empty_query and for the dense subarray fill-in logic:
// returns true if a range has been selected on at least one dimension, i.e.
// some entry of subarray_range_set_ is true.
bool _has_any_subarray_range_set() {
    // Iterate by const reference: iterating by value copies every
    // (dimension name, flag) pair, including its std::string key.
    for (const auto& entry : subarray_range_set_) {
        if (entry.second) {
            return true;
        }
    }
    return false;
}

/**
* This handles a few internals.
*
* One is that a dense array must have _at least one_
* dim's subarray set for a read query. Without that, reads fail immediately
* with the unambiguous
*
* DenseReader: Cannot initialize reader; Dense reads must have a subarray
* set
*
* The other is a combination of several things. Firstly, there is
* current-domain support, which we have for sparse arrays as of core 2.26
* and for dense arrays as of core 2.27. Secondly, without current-domain
* support, we had small domains; with it, we have huge core domains
* (2^63-ish) which are immutable, and small current domains which are
* upward-mutable. (The soma domain and maxdomain, respectively, are the core
* current domain and the core domain.) Thirdly, if a query doesn't have a
* subarray set on a particular dim, core will use the core domain on that
* dim. That was fine when core domains were small; it is not fine now that
* they are huge. In this routine, if the array is dense, then for each dim
* without a subarray set, we set it to match the soma domain. This
* guarantees correct behavior.
*/
void _fill_in_subarrays_if_dense(bool is_read);
void _fill_in_subarrays_if_dense_with_new_shape(
const CurrentDomain& current_domain);
void _fill_in_subarrays_if_dense_without_new_shape(bool is_read);

// TileDB array being queried.
std::shared_ptr<Array> array_;

Expand All @@ -433,7 +473,7 @@ class ManagedQuery {
std::unique_ptr<Subarray> subarray_;

// Map whether a range has been added to the subarray for each dimension
// (keyed by dimension name)
bool subarray_range_set_ = false;
std::map<std::string, bool> subarray_range_set_ = {};

// Map whether the dimension is empty (true) or not
std::map<std::string, bool> subarray_range_empty_ = {};
Expand Down
2 changes: 1 addition & 1 deletion libtiledbsoma/test/common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ const int CORE_DOMAIN_MAX = 2147483646;
static std::unique_ptr<ArrowArray> _create_index_cols_info_array(
const std::vector<DimInfo>& dim_infos);

// Core PRP: https://github.com/TileDB-Inc/TileDB/pull/5303
// Core PR: https://github.com/TileDB-Inc/TileDB/pull/5303
bool have_dense_current_domain_support() {
auto vers = tiledbsoma::version::embedded_version_triple();
return std::get<0>(vers) >= 2 && std::get<1>(vers) >= 27;
Expand Down
3 changes: 0 additions & 3 deletions libtiledbsoma/test/unit_soma_sparse_ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -340,8 +340,6 @@ TEST_CASE("SOMASparseNDArray: metadata", "[SOMASparseNDArray]") {
REQUIRE(snda->metadata_num() == 2);
}
}
void breakme() {
}

TEST_CASE(
"SOMASparseNDArray: can_tiledbsoma_upgrade_shape", "[SOMASparseNDArray]") {
Expand Down Expand Up @@ -389,7 +387,6 @@ TEST_CASE(
REQUIRE(dom.first == 0);
REQUIRE(dom.second == dim_max);

breakme();
std::vector<int64_t> newshape_wrong_dims({dim_max, 12});
std::vector<int64_t> newshape_too_big({dim_max + 10});
std::vector<int64_t> newshape_good({40});
Expand Down