From 3224740c4c371512154affb7c5ae60a74bbebb6c Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos <38084549+XanthosXanthopoulos@users.noreply.github.com> Date: Tue, 28 Jan 2025 17:40:12 +0200 Subject: [PATCH] [c++] Integrate `SOMAColumn`: Update domain accessors inside `SOMAArray`, part 2 (#3407) * SOMAColumn abstract class definition * Remove fmt::format * Remove unneeded methods and member variables * Add concrete class wrapper for TileDB dimension * Add minimal testing for dimensions * Replace string_view with string when returning column name, add current domain checks, replace vector with span when selecting points * Add concrete class wrapper for TileDB attribute * Update CMake files * Add minimal testing for dimensions * Misc fixes * Add read test case * Remove current_domain flag * Do not export soma column [skip ci] * Migrate array creation to SOMAColumn * Misc fixes * [c++] SOMAColumn serialization/deserialization (#3599) * Add minimal testing for dimensions * Add minimal testing for dimensions * Add read test case * Remove current_domain flag * Do not export soma column [skip ci] * Replace string_view with string when returning column name, add current domain checks, replace vector with span when selecting points * Add serialization/deserialization methods * Serialize SOMAColumn on schema generation * Update unit tests * Generate columns on array open * Add deserialization and default initialization on array open * Write SOMAColumn metadata if array is open in `write` mode * Write metadata directly to TileDB array * Fix error in tests after rebase * Handle addition and deletion of attributes * Fix R tests * [c++] Make `SOMAColumn` metadata required only for `GeometryDataframe` (#3621) * Make SOMAColumn metadata only required by GeometryDataframe * Update tests * Fill SOMAColumn info on array open * Migrate domain access methods to use SOMAColumns * Add optional non empty domain method * Replace optional non empty domain with the SOMAColumn implementation, 
update python bindings * Add template-specialization guards * Remove unsupported dimension datatypes * Update old version of `fill_metadata_cache` * Filter SOMAColumns when iterating to construct the domain * Fix serialized columns order * log type [skip ci] * Specify LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION for clang --- apis/python/setup.py | 5 + apis/python/src/tiledbsoma/soma_array.cc | 6 +- libtiledbsoma/CMakeLists.txt | 4 +- libtiledbsoma/src/soma/soma_array.cc | 152 ++++-------- libtiledbsoma/src/soma/soma_array.h | 173 ++------------ libtiledbsoma/src/soma/soma_attribute.cc | 10 +- libtiledbsoma/src/soma/soma_attribute.h | 3 + libtiledbsoma/src/soma/soma_column.cc | 29 ++- libtiledbsoma/src/soma/soma_column.h | 43 +++- libtiledbsoma/src/soma/soma_dimension.cc | 225 +++++++++++++++--- libtiledbsoma/src/soma/soma_dimension.h | 3 + .../src/soma/soma_geometry_column.cc | 46 ++++ libtiledbsoma/src/soma/soma_geometry_column.h | 3 + libtiledbsoma/src/utils/arrow_adapter.cc | 59 +++-- libtiledbsoma/src/utils/arrow_adapter.h | 10 + 15 files changed, 461 insertions(+), 310 deletions(-) diff --git a/apis/python/setup.py b/apis/python/setup.py index 37a1703008..4b6214911f 100644 --- a/apis/python/setup.py +++ b/apis/python/setup.py @@ -256,6 +256,11 @@ def run(self): if sys.platform == "darwin": CXX_FLAGS.append("-mmacosx-version-min=13.3") + # This is necessary for clang to enable templated function calls + # between pybind modules and libtiledbsoma where dynamic_cast or + # std::any_cast is involved + CXX_FLAGS.append("-D_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=2") + if os.name == "posix" and sys.platform != "darwin": LIB_DIRS.append(str(tiledbsoma_dir / "lib" / "x86_64-linux-gnu")) LIB_DIRS.append(str(tiledbsoma_dir / "lib64")) diff --git a/apis/python/src/tiledbsoma/soma_array.cc b/apis/python/src/tiledbsoma/soma_array.cc index dce4057aba..cc6bcb35f7 100644 --- a/apis/python/src/tiledbsoma/soma_array.cc +++ b/apis/python/src/tiledbsoma/soma_array.cc @@ -758,7 
+758,8 @@ void load_soma_array(py::module& m) { array.non_empty_domain_slot(name)); case TILEDB_STRING_UTF8: case TILEDB_STRING_ASCII: - return py::cast(array.non_empty_domain_slot_var(name)); + return py::cast( + array.non_empty_domain_slot(name)); default: throw TileDBSOMAError( "Unsupported dtype for nonempty domain."); @@ -814,7 +815,8 @@ void load_soma_array(py::module& m) { array.non_empty_domain_slot_opt(name)); case TILEDB_STRING_UTF8: case TILEDB_STRING_ASCII: - return py::cast(array.non_empty_domain_slot_var(name)); + return py::cast( + array.non_empty_domain_slot_opt(name)); default: throw TileDBSOMAError( "Unsupported dtype for nonempty domain."); diff --git a/libtiledbsoma/CMakeLists.txt b/libtiledbsoma/CMakeLists.txt index 6d0ff30125..ea70190ecd 100644 --- a/libtiledbsoma/CMakeLists.txt +++ b/libtiledbsoma/CMakeLists.txt @@ -89,6 +89,8 @@ set(CMAKE_CXX_EXTENSIONS OFF) # Don't use GNU extensions # Build with fPIC set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_VERBOSE_MAKEFILE ON) + # Set default builds/configuration to be Release. 
get_property(is_multi_config GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) if (is_multi_config) @@ -221,7 +223,7 @@ if(MSVC) ) else() - set(TILEDBSOMA_COMPILE_OPTIONS -Wall -Wextra -DSPDLOG_USE_STD_FORMAT) + set(TILEDBSOMA_COMPILE_OPTIONS -Wall -Wextra -DSPDLOG_USE_STD_FORMAT -D_LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION=2) if(${TILEDBSOMA_ENABLE_WERROR}) set(TILEDBSOMA_WERROR_OPTION -Werror) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 30e8df7a6a..779fb40545 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -12,8 +12,12 @@ #include "soma_array.h" #include +#include #include "../utils/logger.h" #include "../utils/util.h" +#include "soma_attribute.h" +#include "soma_dimension.h" +#include "soma_geometry_column.h" namespace tiledbsoma { using namespace tiledb; @@ -266,7 +270,8 @@ void SOMAArray::open(OpenMode mode, std::optional timestamp) { validate(mode, name_, timestamp); reset(column_names(), batch_size_, result_order_); - fill_metadata_cache(timestamp); + fill_metadata_cache(timestamp_); + fill_columns(); } std::unique_ptr SOMAArray::reopen( @@ -490,115 +495,24 @@ std::optional SOMAArray::timestamp() { // The domainish enum simply lets us re-use code which is common across // core domain, core current domain, and core non-empty domain. 
ArrowTable SOMAArray::_get_core_domainish(enum Domainish which_kind) { - int array_ndim = this->ndim(); - auto dimensions = tiledb_schema()->domain().dimensions(); + int array_ndim = std::count_if( + columns_.begin(), columns_.end(), [](const auto& col) { + return col->isIndexColumn(); + }); - // Create the schema for the info we return - std::vector names(array_ndim); - std::vector tiledb_datatypes(array_ndim); - - for (int i = 0; i < (int)array_ndim; i++) { - const Dimension& core_dim = dimensions[i]; - names[i] = core_dim.name(); - tiledb_datatypes[i] = core_dim.type(); - } - - auto arrow_schema = ArrowAdapter::make_arrow_schema( - names, tiledb_datatypes); - - // Create the data for the info we return + auto arrow_schema = ArrowAdapter::make_arrow_schema_parent(array_ndim); auto arrow_array = ArrowAdapter::make_arrow_array_parent(array_ndim); - for (int i = 0; i < array_ndim; i++) { - auto core_dim = dimensions[i]; - auto core_type_code = core_dim.type(); + size_t child_index = 0; + for (const auto& column : + columns_ | std::views::filter( + [](const auto& col) { return col->isIndexColumn(); })) { + arrow_schema->children[child_index] = column->arrow_schema_slot( + *ctx_, *arr_); + arrow_array->children[child_index] = column->arrow_domain_slot( + *ctx_, *arr_, which_kind); - ArrowArray* child = nullptr; - - switch (core_type_code) { - case TILEDB_INT64: - case TILEDB_DATETIME_YEAR: - case TILEDB_DATETIME_MONTH: - case TILEDB_DATETIME_WEEK: - case TILEDB_DATETIME_DAY: - case TILEDB_DATETIME_HR: - case TILEDB_DATETIME_MIN: - case TILEDB_DATETIME_SEC: - case TILEDB_DATETIME_MS: - case TILEDB_DATETIME_US: - case TILEDB_DATETIME_NS: - case TILEDB_DATETIME_PS: - case TILEDB_DATETIME_FS: - case TILEDB_DATETIME_AS: - case TILEDB_TIME_HR: - case TILEDB_TIME_MIN: - case TILEDB_TIME_SEC: - case TILEDB_TIME_MS: - case TILEDB_TIME_US: - case TILEDB_TIME_NS: - case TILEDB_TIME_PS: - case TILEDB_TIME_FS: - case TILEDB_TIME_AS: - child = ArrowAdapter::make_arrow_array_child( 
- _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_UINT64: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot( - core_dim.name(), which_kind)); - break; - case TILEDB_INT32: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_UINT32: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot( - core_dim.name(), which_kind)); - break; - case TILEDB_INT16: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_UINT16: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot( - core_dim.name(), which_kind)); - break; - case TILEDB_INT8: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_UINT8: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - - case TILEDB_FLOAT64: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - case TILEDB_FLOAT32: - child = ArrowAdapter::make_arrow_array_child( - _core_domainish_slot(core_dim.name(), which_kind)); - break; - - case TILEDB_STRING_ASCII: - case TILEDB_CHAR: - case TILEDB_GEOM_WKB: - case TILEDB_GEOM_WKT: - child = ArrowAdapter::make_arrow_array_child_string( - _core_domainish_slot_string(core_dim.name(), which_kind)); - break; - - default: - throw TileDBSOMAError(std::format( - "SOMAArray::_get_core_domainish:dim {} has unhandled type " - "{}", - core_dim.name(), - tiledb::impl::type_to_str(core_type_code))); - } - arrow_array->children[i] = child; + ++child_index; } return ArrowTable(std::move(arrow_array), std::move(arrow_schema)); @@ -1696,4 +1610,30 @@ void SOMAArray::_check_dims_are_int64() { } } +std::shared_ptr SOMAArray::get_column(std::string_view name) const { + auto result = std::find_if(columns_.begin(), 
columns_.end(), [&](auto col) { + return col->name() == name; + }); + + if (result == columns_.end()) { + throw TileDBSOMAError(std::format( + "[SOMAArray] internal coding error: No column named {} found", + name)); + } + + return *result; +} + +std::shared_ptr SOMAArray::get_column(std::size_t index) const { + if (index >= columns_.size()) { + throw TileDBSOMAError(std::format( + "[SOMAArray] internal coding error: Column index outside of range. " + "Requested {}, but {} exist.", + index, + columns_.size())); + } + + return columns_[index]; +} + } // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index ef682b5165..30bba9571b 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -336,7 +336,7 @@ class SOMAArray : public SOMAObject { */ template void set_dim_point(const std::string& dim, const T& point) { - mq_->select_point(dim, point); + get_column(dim)->set_dim_point(mq_, *ctx_, point); } /** @@ -382,10 +382,10 @@ class SOMAArray : public SOMAObject { start + partition_size - 1, points.size())); - mq_->select_points( - dim, std::span{&points[start], partition_size}); + get_column(dim)->set_dim_points( + mq_, *ctx_, std::span{&points[start], partition_size}); } else { - mq_->select_points(dim, points); + get_column(dim)->set_dim_points(mq_, *ctx_, points); } } @@ -403,7 +403,7 @@ class SOMAArray : public SOMAObject { LOG_DEBUG( "[SOMAArray] set_dim_points: sizeof(T)=" + std::to_string(sizeof(T))); - mq_->select_points(dim, points); + get_column(dim)->set_dim_points(mq_, *ctx_, std::span(points)); } /** @@ -418,7 +418,7 @@ class SOMAArray : public SOMAObject { template void set_dim_ranges( const std::string& dim, const std::vector>& ranges) { - mq_->select_ranges(dim, ranges); + get_column(dim)->set_dim_ranges(mq_, *ctx_, ranges); } /** @@ -443,7 +443,9 @@ class SOMAArray : public SOMAObject { */ void select_columns( const std::vector& names, bool if_not_empty = 
false) { - mq_->select_columns(names, if_not_empty); + for (const std::string& name : names) { + get_column(name)->select_columns(mq_, if_not_empty); + } } /** @@ -592,7 +594,7 @@ class SOMAArray : public SOMAObject { * * @return size_t Total number of cells read */ - size_t total_num_cells() { + std::size_t total_num_cells() { return mq_->total_num_cells(); } @@ -756,11 +758,7 @@ class SOMAArray : public SOMAObject { */ template std::pair non_empty_domain_slot(const std::string& name) const { - try { - return arr_->non_empty_domain(name); - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } + return get_column(name)->non_empty_domain_slot(*arr_); } /** @@ -771,45 +769,7 @@ class SOMAArray : public SOMAObject { template std::optional> non_empty_domain_slot_opt( const std::string& name) const { - try { - int32_t is_empty; - T ned[2]; - - // TODO currently we need to use the TileDB C API in order to check - // if the domain is empty or not. The C++ API returns (0, 0) - // currently which could also represent a single point at coordinate - // 0. Replace this when the C++ API supports correct checking for - // empty domains - ctx_->tiledb_ctx()->handle_error( - tiledb_array_get_non_empty_domain_from_name( - ctx_->tiledb_ctx()->ptr().get(), - arr_->ptr().get(), - name.c_str(), - &ned, - &is_empty)); - - if (is_empty == 1) { - return std::nullopt; - } else { - return std::make_pair(ned[0], ned[1]); - } - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } - } - - /** - * Retrieves the non-empty domain from the array on the given dimension. - * This is the union of the non-empty domains of the array fragments. - * Applicable only to var-sized dimensions. 
- */ - std::pair non_empty_domain_slot_var( - const std::string& name) const { - try { - return arr_->non_empty_domain_var(name); - } catch (const std::exception& e) { - throw TileDBSOMAError(e.what()); - } + return get_column(name)->non_empty_domain_slot_opt(*ctx_, *arr_); } /** @@ -848,63 +808,7 @@ class SOMAArray : public SOMAObject { */ template std::pair _core_current_domain_slot(const std::string& name) const { - if (std::is_same_v) { - throw std::runtime_error( - "SOMAArray::soma_domain_slot: template-specialization " - "failure."); - } - CurrentDomain current_domain = _get_current_domain(); - if (current_domain.is_empty()) { - throw TileDBSOMAError( - "_core_current_domain_slot: internal coding error"); - } - if (current_domain.type() != TILEDB_NDRECTANGLE) { - throw TileDBSOMAError( - "_core_current_domain_slot: found non-rectangle type"); - } - NDRectangle ndrect = current_domain.ndrectangle(); - - // Convert from two-element array (core API) to pair (tiledbsoma API) - std::array arr = ndrect.range(name); - return std::pair(arr[0], arr[1]); - } - - std::pair _core_current_domain_slot_string( - const std::string& name) const { - CurrentDomain current_domain = _get_current_domain(); - if (current_domain.is_empty()) { - throw TileDBSOMAError( - "_core_current_domain_slot: internal coding error"); - } - if (current_domain.type() != TILEDB_NDRECTANGLE) { - throw TileDBSOMAError( - "_core_current_domain_slot: found non-rectangle type"); - } - NDRectangle ndrect = current_domain.ndrectangle(); - - // Convert from two-element array (core API) to pair (tiledbsoma API) - std::array arr = ndrect.range(name); - - // Here is an intersection of a few oddities: - // - // * Core domain for string dims must be a nullptr pair; it cannot be - // anything else. - // * TileDB-Py shows this by using an empty-string pair, which we - // imitate. - // * Core current domain for string dims must _not_ be a nullptr pair. 
- // * In TileDB-SOMA, unless the user specifies otherwise, we use "" for - // min and "\x7f" for max. (We could use "\x7f" but that causes - // display problems in Python.) - // - // To work with all these factors, if the current domain is the default - // "" to "\7f", return an empty-string pair just as we do for domain. - // (There was some pre-1.15 software using "\xff" and it's super-cheap - // to check for that as well.) - if (arr[0] == "" && (arr[1] == "\x7f" || arr[1] == "\xff")) { - return std::pair("", ""); - } else { - return std::pair(arr[0], arr[1]); - } + return get_column(name)->core_current_domain_slot(*ctx_, *arr_); } /** @@ -923,22 +827,9 @@ class SOMAArray : public SOMAObject { */ template std::pair _core_domain_slot(const std::string& name) const { - if (std::is_same_v) { - throw std::runtime_error( - "SOMAArray::_core_domain_slot: template-specialization " - "failure."); - } return schema_->domain().dimension(name).domain(); } - std::pair _core_domain_slot_string( - const std::string&) const { - // Core domain for string dims is always a nullptr pair at the C++ - // level. We follow the convention started by TileDB-Py which is to - // report these as an empty-string pair. - return std::pair("", ""); - } - /** * Returns the SOMA domain at the given dimension. 
* @@ -1032,39 +923,7 @@ class SOMAArray : public SOMAObject { template std::pair _core_domainish_slot( const std::string& name, enum Domainish which_kind) const { - if (std::is_same_v) { - throw std::runtime_error( - "SOMAArray::_core_domainish_slot: template-specialization " - "failure."); - } - switch (which_kind) { - case Domainish::kind_core_domain: - return _core_domain_slot(name); - case Domainish::kind_core_current_domain: - return _core_current_domain_slot(name); - case Domainish::kind_non_empty_domain: - return non_empty_domain_slot(name); - default: - throw std::runtime_error( - "internal coding error in SOMAArray::_core_domainish_slot: " - "unknown kind"); - } - } - - std::pair _core_domainish_slot_string( - const std::string& name, enum Domainish which_kind) const { - switch (which_kind) { - case Domainish::kind_core_domain: - return _core_domain_slot_string(name); - case Domainish::kind_core_current_domain: - return _core_current_domain_slot_string(name); - case Domainish::kind_non_empty_domain: - return non_empty_domain_slot_var(name); - default: - throw std::runtime_error( - "internal coding error in " - "SOMAArray::_core_domainish_slot_string: unknown kind"); - } + return get_column(name)->domain_slot(*ctx_, *arr_, which_kind); } /** @@ -1287,6 +1146,10 @@ class SOMAArray : public SOMAObject { _set_domain_helper(newdomain, false, function_name_for_messages); } + std::shared_ptr get_column(std::string_view name) const; + + std::shared_ptr get_column(std::size_t index) const; + protected: // See top-of-file notes regarding methods for SOMADataFrame being // defined in this file. 
diff --git a/libtiledbsoma/src/soma/soma_attribute.cc b/libtiledbsoma/src/soma/soma_attribute.cc index 710946d17e..2c57768abf 100644 --- a/libtiledbsoma/src/soma/soma_attribute.cc +++ b/libtiledbsoma/src/soma/soma_attribute.cc @@ -113,6 +113,14 @@ std::any SOMAAttribute::_non_empty_domain_slot(Array&) const { name())); } +std::any SOMAAttribute::_non_empty_domain_slot_opt( + const SOMAContext&, Array&) const { + throw TileDBSOMAError(std::format( + "[SOMAAttribute][_non_empty_domain_slot] Column with name {} is not an " + "index column", + name())); +} + std::any SOMAAttribute::_core_current_domain_slot( const SOMAContext&, Array&) const { throw TileDBSOMAError(std::format( @@ -152,4 +160,4 @@ void SOMAAttribute::serialize(nlohmann::json& columns_schema) const { columns_schema.push_back(column); } -} // namespace tiledbsoma \ No newline at end of file +} // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_attribute.h b/libtiledbsoma/src/soma/soma_attribute.h index 91ce472998..ef6b6c23f3 100644 --- a/libtiledbsoma/src/soma/soma_attribute.h +++ b/libtiledbsoma/src/soma/soma_attribute.h @@ -129,6 +129,9 @@ class SOMAAttribute : public SOMAColumn { std::any _non_empty_domain_slot(Array& array) const override; + std::any _non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const override; + std::any _core_current_domain_slot( const SOMAContext& ctx, Array& array) const override; diff --git a/libtiledbsoma/src/soma/soma_column.cc b/libtiledbsoma/src/soma/soma_column.cc index 98e0592aa5..bfb8cffd68 100644 --- a/libtiledbsoma/src/soma/soma_column.cc +++ b/libtiledbsoma/src/soma/soma_column.cc @@ -142,9 +142,36 @@ SOMAColumn::core_current_domain_slot( if (current_domain.first == "" && (current_domain.second == "\x7f" || current_domain.second == "\xff")) { return std::pair("", ""); + } else { + throw TileDBSOMAError(std::format( + "[SOMAColumn][core_current_domain_slot] unexpected current " + "domain returnd ({}, {})", + current_domain.first, + 
current_domain.second)); } + } catch (const std::exception& e) { + throw TileDBSOMAError(e.what()); + } +} + +template <> +std::pair +SOMAColumn::core_current_domain_slot(NDRectangle& ndrect) const { + try { + std::pair + current_domain = std::any_cast>( + _core_current_domain_slot(ndrect)); - return current_domain; + if (current_domain.first == "" && (current_domain.second == "\x7f" || + current_domain.second == "\xff")) { + return std::pair("", ""); + } else { + throw TileDBSOMAError(std::format( + "[SOMAColumn][core_current_domain_slot] unexpected current " + "domain returnd ({}, {})", + current_domain.first, + current_domain.second)); + } } catch (const std::exception& e) { throw TileDBSOMAError(e.what()); } diff --git a/libtiledbsoma/src/soma/soma_column.h b/libtiledbsoma/src/soma/soma_column.h index 3d1fa94a0d..397759f696 100644 --- a/libtiledbsoma/src/soma/soma_column.h +++ b/libtiledbsoma/src/soma/soma_column.h @@ -381,6 +381,12 @@ class SOMAColumn { */ template std::pair core_domain_slot() const { + if (std::is_same_v) { + throw std::runtime_error( + "SOMAArray::soma_domain_slot: template-specialization " + "failure."); + } + try { return std::any_cast>(_core_domain_slot()); } catch (const std::exception& e) { @@ -394,8 +400,8 @@ class SOMAColumn { /** * Retrieves the non-empty domain from the array. This is the union of the - * non-empty domains of the array fragments. Returns (0, 0) for empty - * domains. + * non-empty domains of the array fragments. Returns (0, 0) or ("", "") for + * empty domains. */ template std::pair non_empty_domain_slot(Array& array) const { @@ -411,6 +417,26 @@ class SOMAColumn { } } + /** + * Retrieves the non-empty domain from the array. This is the union of the + * non-empty domains of the array fragments. Returns std::nullopt for + * empty domains. 
+ */ + template + std::optional> non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const { + try { + return std::any_cast>>( + _non_empty_domain_slot_opt(ctx, array)); + } catch (const std::exception& e) { + throw TileDBSOMAError(std::format( + "[SOMAColumn][non_empty_domain_slot_opt] Failed on \"{}\" with " + "error \"{}\"", + name(), + e.what())); + } + } + /** * Returns the core current domain of this column. * @@ -428,6 +454,12 @@ class SOMAColumn { template std::pair core_current_domain_slot( const SOMAContext& ctx, Array& array) const { + if (std::is_same_v) { + throw std::runtime_error( + "SOMAArray::soma_domain_slot: template-specialization " + "failure."); + } + try { return std::any_cast>( _core_current_domain_slot(ctx, array)); @@ -489,6 +521,9 @@ class SOMAColumn { virtual std::any _non_empty_domain_slot(Array& array) const = 0; + virtual std::any _non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const = 0; + virtual std::any _core_current_domain_slot( const SOMAContext& ctx, Array& array) const = 0; @@ -510,5 +545,9 @@ std::pair SOMAColumn::core_current_domain_slot( const SOMAContext& ctx, Array& array) const; +template <> +std::pair +SOMAColumn::core_current_domain_slot(NDRectangle& ndrect) const; + } // namespace tiledbsoma #endif diff --git a/libtiledbsoma/src/soma/soma_dimension.cc b/libtiledbsoma/src/soma/soma_dimension.cc index dad9a3a01d..36a34a2154 100644 --- a/libtiledbsoma/src/soma/soma_dimension.cc +++ b/libtiledbsoma/src/soma/soma_dimension.cc @@ -123,8 +123,6 @@ void SOMADimension::_set_dim_points( break; case TILEDB_STRING_UTF8: case TILEDB_STRING_ASCII: - case TILEDB_CHAR: - case TILEDB_BLOB: query->select_points( dimension.name(), std::any_cast>(points)); @@ -215,10 +213,6 @@ void SOMADimension::_set_dim_ranges( break; case TILEDB_STRING_UTF8: case TILEDB_STRING_ASCII: - case TILEDB_CHAR: - case TILEDB_BLOB: - case TILEDB_GEOM_WKT: - case TILEDB_GEOM_WKB: query->select_ranges( dimension.name(), 
std::any_cast>>( @@ -296,10 +290,7 @@ void SOMADimension::_set_current_domain_slot( } break; case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_BLOB: - case TILEDB_GEOM_WKT: - case TILEDB_GEOM_WKB: { + case TILEDB_GEOM_WKT: { // Here is an intersection of a few oddities: // // * Core domain for string dims must be a nullptr pair; it cannot @@ -453,11 +444,7 @@ std::pair SOMADimension::_can_set_current_domain_slot( return comparator( std::any_cast>(new_domain[0])); case TILEDB_STRING_ASCII: - case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_BLOB: - case TILEDB_GEOM_WKT: - case TILEDB_GEOM_WKB: { + case TILEDB_STRING_UTF8: { auto dom = std::any_cast>(new_domain[0]); if (dom[0] != "" || dom[1] != "") { return std::pair( @@ -575,10 +562,6 @@ std::any SOMADimension::_non_empty_domain_slot(Array& array) const { array.non_empty_domain(dimension.name())); case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: - case TILEDB_BLOB: - case TILEDB_CHAR: - case TILEDB_GEOM_WKB: - case TILEDB_GEOM_WKT: return std::make_any>( array.non_empty_domain_var(dimension.name())); default: @@ -589,6 +572,199 @@ std::any SOMADimension::_non_empty_domain_slot(Array& array) const { } } +std::any SOMADimension::_non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const { + int32_t is_empty; + + if (dimension.type() == TILEDB_STRING_ASCII || + dimension.type() == TILEDB_STRING_UTF8) { + void* var_start; + void* var_end; + uint64_t size_start, size_end; + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_var_size_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimension.name().c_str(), + &size_start, + &size_end, + &is_empty)); + + if (is_empty) { + return std::make_any< + std::optional>>( + std::nullopt); + } + + var_start = malloc(size_start); + var_end = malloc(size_end); + + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_var_from_name( + ctx.tiledb_ctx()->ptr().get(), + 
array.ptr().get(), + dimension.name().c_str(), + var_start, + var_end, + &is_empty)); + + auto ned = std::make_pair( + std::string((char*)var_start, size_start), + std::string((char*)var_end, size_end)); + free(var_start); + free(var_end); + + return std::make_any< + std::optional>>(ned); + } + + void* fixed_ned = malloc(16); + ctx.tiledb_ctx()->handle_error(tiledb_array_get_non_empty_domain_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimension.name().c_str(), + fixed_ned, + &is_empty)); + + if (is_empty) { + // We free buffer here and return later the correctly typed optional + free(fixed_ned); + } + + switch (dimension.type()) { + case TILEDB_UINT8: { + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((uint8_t*)fixed_ned)[0], ((uint8_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + } + case TILEDB_UINT16: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((uint16_t*)fixed_ned)[0], ((uint16_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_UINT32: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((uint32_t*)fixed_ned)[0], ((uint32_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_UINT64: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((uint64_t*)fixed_ned)[0], ((uint64_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_INT8: + if (is_empty) { + return std::make_any>>( + std::nullopt); + } else { + auto data = std::make_pair( + ((int8_t*)fixed_ned)[0], ((int8_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any>>( + data); + } + case TILEDB_INT16: + if 
(is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((int16_t*)fixed_ned)[0], ((int16_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_INT32: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((int32_t*)fixed_ned)[0], ((int32_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_DATETIME_YEAR: + case TILEDB_DATETIME_MONTH: + case TILEDB_DATETIME_WEEK: + case TILEDB_DATETIME_DAY: + case TILEDB_DATETIME_HR: + case TILEDB_DATETIME_MIN: + case TILEDB_DATETIME_SEC: + case TILEDB_DATETIME_MS: + case TILEDB_DATETIME_US: + case TILEDB_DATETIME_NS: + case TILEDB_DATETIME_PS: + case TILEDB_DATETIME_FS: + case TILEDB_DATETIME_AS: + case TILEDB_INT64: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((int64_t*)fixed_ned)[0], ((int64_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_FLOAT32: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((float_t*)fixed_ned)[0], ((float_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + case TILEDB_FLOAT64: + if (is_empty) { + return std::make_any< + std::optional>>(std::nullopt); + } else { + auto data = std::make_pair( + ((double_t*)fixed_ned)[0], ((double_t*)fixed_ned)[1]); + free(fixed_ned); + return std::make_any< + std::optional>>(data); + } + default: + throw TileDBSOMAError(std::format( + "[SOMADimension][_non_empty_domain_slot] Unknown " + "dimension " + "type {}", + impl::type_to_str(dimension.type()))); + } +} + std::any SOMADimension::_core_current_domain_slot( const SOMAContext& ctx, Array& array) const { CurrentDomain @@ -675,11 +851,7 @@ std::any 
SOMADimension::_core_current_domain_slot(NDRectangle& ndrect) const { std::make_pair(domain[0], domain[1])); } case TILEDB_STRING_UTF8: - case TILEDB_STRING_ASCII: - case TILEDB_CHAR: - case TILEDB_BLOB: - case TILEDB_GEOM_WKT: - case TILEDB_GEOM_WKB: { + case TILEDB_STRING_ASCII: { std::array domain = ndrect.range( dimension.name()); return std::make_any>( @@ -749,9 +921,6 @@ ArrowArray* SOMADimension::arrow_domain_slot( domain_slot(ctx, array, kind)); case TILEDB_STRING_ASCII: case TILEDB_STRING_UTF8: - case TILEDB_CHAR: - case TILEDB_GEOM_WKB: - case TILEDB_GEOM_WKT: return ArrowAdapter::make_arrow_array_child_string( domain_slot(ctx, array, kind)); default: @@ -778,4 +947,4 @@ void SOMADimension::serialize(nlohmann::json& columns_schema) const { columns_schema.push_back(column); } -} // namespace tiledbsoma \ No newline at end of file +} // namespace tiledbsoma diff --git a/libtiledbsoma/src/soma/soma_dimension.h b/libtiledbsoma/src/soma/soma_dimension.h index ea34296907..abf512c8fe 100644 --- a/libtiledbsoma/src/soma/soma_dimension.h +++ b/libtiledbsoma/src/soma/soma_dimension.h @@ -122,6 +122,9 @@ class SOMADimension : public SOMAColumn { std::any _non_empty_domain_slot(Array& array) const override; + std::any _non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const override; + std::any _core_current_domain_slot( const SOMAContext& ctx, Array& array) const override; diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc b/libtiledbsoma/src/soma/soma_geometry_column.cc index 6ea8c14881..537d3ecc71 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.cc +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -391,6 +391,52 @@ std::any SOMAGeometryColumn::_non_empty_domain_slot(Array& array) const { std::make_pair(min, max)); } +std::any SOMAGeometryColumn::_non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const { + std::vector min, max; + size_t dimensionality = dimensions.size() / 2; + int32_t is_empty; + double_t 
fixed_ned[2]; + + for (size_t i = 0; i < dimensionality; ++i) { + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimensions[i].name().c_str(), // Min dimension + fixed_ned, + &is_empty)); + + if (is_empty) { + return std::make_any, std::vector>>>( + std::nullopt); + } + + min.push_back(fixed_ned[0]); + + ctx.tiledb_ctx()->handle_error( + tiledb_array_get_non_empty_domain_from_name( + ctx.tiledb_ctx()->ptr().get(), + array.ptr().get(), + dimensions[i + dimensionality].name().c_str(), // Max dimension + fixed_ned, + &is_empty)); + + if (is_empty) { + return std::make_any, std::vector>>>( + std::nullopt); + } + + max.push_back(fixed_ned[1]); + } + + return std::make_any< + std::optional, std::vector>>>( + std::make_pair(min, max)); +} + std::any SOMAGeometryColumn::_core_current_domain_slot( const SOMAContext& ctx, Array& array) const { CurrentDomain diff --git a/libtiledbsoma/src/soma/soma_geometry_column.h b/libtiledbsoma/src/soma/soma_geometry_column.h index 9792af6d84..1b8a0e7624 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.h +++ b/libtiledbsoma/src/soma/soma_geometry_column.h @@ -130,6 +130,9 @@ class SOMAGeometryColumn : public SOMAColumn { std::any _non_empty_domain_slot(Array& array) const override; + std::any _non_empty_domain_slot_opt( + const SOMAContext& ctx, Array& array) const override; + std::any _core_current_domain_slot( const SOMAContext& ctx, Array& array) const override; diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index 4519c4549a..e3e12a1229 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -11,8 +11,10 @@ * This file defines the ArrowAdapter class.
*/ -#include "arrow_adapter.h" +#include + #include "../soma/column_buffer.h" +#include "arrow_adapter.h" #include "logger.h" #include "util.h" @@ -20,6 +22,10 @@ #include "../soma/soma_dimension.h" #include "../soma/soma_geometry_column.h" +#include "../soma/soma_attribute.h" +#include "../soma/soma_dimension.h" +#include "../soma/soma_geometry_column.h" + namespace tiledbsoma { using namespace tiledb; @@ -968,13 +974,22 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( } } + LOG_DEBUG(std::format("[ArrowAdapter] Additional schema metadata")); + nlohmann::json soma_schema_extension; + soma_schema_extension[TILEDB_SOMA_SCHEMA_COL_KEY] = nlohmann::json::array(); + soma_schema_extension["version"] = TILEDB_SOMA_SCHEMA_VERSION; + // Unit tests expect dimension order should match the index column schema // and NOT the Arrow schema + // We generate the additional schema metadata here to ensure that the + // serialized column order matches the expected schema order for (int64_t i = 0; i < index_column_schema->n_children; ++i) { LOG_DEBUG(std::format("[ArrowAdapter] child {}", i)); const auto column = util::find_column_by_name( columns, index_column_schema->children[i]->name); + column->serialize(soma_schema_extension[TILEDB_SOMA_SCHEMA_COL_KEY]); + if (column->tiledb_dimensions().has_value()) { // Intermediate variable required to avoid lifetime issues auto dimensions = column->tiledb_dimensions().value(); @@ -999,10 +1014,10 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( } } - for (auto column : columns) { - if (column->isIndexColumn()) { - continue; - } + for (const auto& column : columns | std::views::filter([](const auto& col) { + return !col->isIndexColumn(); + })) { + column->serialize(soma_schema_extension[TILEDB_SOMA_SCHEMA_COL_KEY]); if (column->tiledb_enumerations().has_value()) { auto enumerations = column->tiledb_enumerations().value(); @@ -1067,15 +1082,6 @@ ArrowAdapter::tiledb_schema_from_arrow_schema( LOG_DEBUG(std::format("[ArrowAdapter] check")); 
schema.check(); - LOG_DEBUG(std::format("[ArrowAdapter] Additional schema metadata")); - nlohmann::json soma_schema_extension; - - soma_schema_extension[TILEDB_SOMA_SCHEMA_COL_KEY] = nlohmann::json::array(); - for (const auto& column : columns) { - column->serialize(soma_schema_extension[TILEDB_SOMA_SCHEMA_COL_KEY]); - } - soma_schema_extension["version"] = TILEDB_SOMA_SCHEMA_VERSION; - LOG_DEBUG(std::format("[ArrowAdapter] returning")); return std::make_tuple(schema, soma_schema_extension); } @@ -1604,6 +1610,31 @@ std::unique_ptr ArrowAdapter::make_arrow_schema( return arrow_schema; } +std::unique_ptr ArrowAdapter::make_arrow_schema_parent( + int num_columns) { + auto arrow_schema = std::make_unique(); + arrow_schema->format = "+s"; // structure, i.e. non-leaf node + arrow_schema->name = strdup("parent"); + arrow_schema->metadata = nullptr; + arrow_schema->flags = 0; + arrow_schema->n_children = num_columns; // non-leaf node + arrow_schema->children = (ArrowSchema**)malloc( + arrow_schema->n_children * sizeof(ArrowSchema*)); + arrow_schema->dictionary = nullptr; + arrow_schema->release = &ArrowAdapter::release_schema; + arrow_schema->private_data = nullptr; + + for (int i = 0; i < num_columns; i++) { + arrow_schema->children[i] = nullptr; + } + + LOG_DEBUG(std::format( + "[ArrowAdapter] make_arrow_schema_parent n_children {}", + arrow_schema->n_children)); + + return arrow_schema; +} + std::unique_ptr ArrowAdapter::make_arrow_array_parent( int num_columns) { auto arrow_array = std::make_unique(); diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index 2f851d884c..4004974aa9 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -490,6 +490,16 @@ class ArrowAdapter { const std::vector& names, const std::vector& tiledb_datatypes); + /** + * @brief Creates a nanoarrow ArrowSchema which accommodates + * a varying number of columns.
+ * + * Note that the parents and children in nanoarrow are both of type + * ArrowSchema. This constructs the parent and not the children. + */ + static std::unique_ptr make_arrow_schema_parent( + int num_columns); + /** * @brief Creates a nanoarrow ArrowArray which accommodates * a varying number of columns.