From 4f69131ea351f96c99c525479af1fc2fa2636775 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Thu, 21 Nov 2024 14:15:24 +0200 Subject: [PATCH 01/16] Add minimal testing for dimensions --- libtiledbsoma/src/tiledbsoma/tiledbsoma | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libtiledbsoma/src/tiledbsoma/tiledbsoma b/libtiledbsoma/src/tiledbsoma/tiledbsoma index 98e3075639..b465f82e5e 100644 --- a/libtiledbsoma/src/tiledbsoma/tiledbsoma +++ b/libtiledbsoma/src/tiledbsoma/tiledbsoma @@ -50,6 +50,8 @@ #include "soma/soma_coordinates.h" #include "soma/soma_array.h" #include "soma/soma_collection.h" +#include "soma/soma_column.h" +#include "soma/soma_dimension.h" #include "soma/soma_dataframe.h" #include "soma/soma_group.h" #include "soma/soma_experiment.h" From 3949a62297d8c29bc98f2d7982f15b142db45174 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Thu, 21 Nov 2024 14:15:24 +0200 Subject: [PATCH 02/16] Add minimal testing for dimensions --- libtiledbsoma/src/tiledbsoma/tiledbsoma | 1 + libtiledbsoma/test/unit_soma_column.cc | 137 +++++++++++++----------- 2 files changed, 77 insertions(+), 61 deletions(-) diff --git a/libtiledbsoma/src/tiledbsoma/tiledbsoma b/libtiledbsoma/src/tiledbsoma/tiledbsoma index b465f82e5e..59b3d89b1b 100644 --- a/libtiledbsoma/src/tiledbsoma/tiledbsoma +++ b/libtiledbsoma/src/tiledbsoma/tiledbsoma @@ -52,6 +52,7 @@ #include "soma/soma_collection.h" #include "soma/soma_column.h" #include "soma/soma_dimension.h" +#include "soma/soma_geometry_column.h" #include "soma/soma_dataframe.h" #include "soma/soma_group.h" #include "soma/soma_experiment.h" diff --git a/libtiledbsoma/test/unit_soma_column.cc b/libtiledbsoma/test/unit_soma_column.cc index 94c619c45e..45a2920235 100644 --- a/libtiledbsoma/test/unit_soma_column.cc +++ b/libtiledbsoma/test/unit_soma_column.cc @@ -30,8 +30,6 @@ * This file manages unit tests for implementation of SOMAColumn class */ -#include -#include #include #include "../src/soma/soma_attribute.h" #include "../src/soma/soma_column.h" @@ -200,68 +198,85 @@ struct VariouslyIndexedDataFrameFixture { }; TEST_CASE("SOMAColumn: SOMADimension") { + auto use_current_domain = GENERATE(false, true); auto ctx = std::make_shared(); PlatformConfig platform_config{}; - std::vector dim_infos( - {helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_UINT32, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A"}), - helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A"}), - helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_INT64, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A"}), - helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_STRING_ASCII, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A"})}); - - std::vector geom_dim_infos({helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_GEOM_WKB, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A"})}); - - std::vector spatial_dim_infos( - {helper::DimInfo( - {.name = "x", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 200, - .string_lo = "N/A", - .string_hi = "N/A"}), - helper::DimInfo( - {.name = "y", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A"})}); - - auto index_columns = helper::create_column_index_info(dim_infos); - - std::vector> columns; - - for (int64_t i = 0; i < index_columns.second->n_children; ++i) { - columns.push_back(SOMADimension::create( - ctx->tiledb_ctx(), - index_columns.second->children[i], - index_columns.first->children[i], - "SOMAGeometryDataFrame", - "", - platform_config)); + SECTION(std::format("- use_current_domain={}", use_current_domain)) { + std::vector dim_infos( + {helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_UINT32, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain}), + helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain}), + helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_INT64, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain}), + helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_STRING_ASCII, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain})}); + + std::vector geom_dim_infos({helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_GEOM_WKB, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain})}); + + std::vector spatial_dim_infos( + {helper::DimInfo( + {.name = "x", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 200, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain}), + helper::DimInfo( + {.name = "y", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A", + .use_current_domain = use_current_domain})}); + + auto index_columns = helper::create_column_index_info(dim_infos); + + std::vector> columns; + bool has_current_domain = true; + + for (int64_t i = 0; i < index_columns.second->n_children; ++i) { + columns.push_back(SOMADimension::create( + ctx->tiledb_ctx(), + index_columns.second->children[i], + index_columns.first->children[i], + "SOMAGeometryDataFrame", + "", + platform_config, + has_current_domain)); + + REQUIRE(has_current_domain == use_current_domain); + REQUIRE( + columns.back()->tiledb_dimensions().value()[0].type() == + dim_infos[i].tiledb_datatype); + } REQUIRE( columns.back()->tiledb_dimensions().value()[0].type() == From dbd2f3fef403b5da2362ddfd48817ce37c818a2a Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Fri, 22 Nov 2024 17:55:23 +0200 Subject: [PATCH 03/16] Add read test case --- libtiledbsoma/test/unit_soma_column.cc | 189 +++++++++++++++++++++++++ 1 file changed, 189 insertions(+) diff --git a/libtiledbsoma/test/unit_soma_column.cc b/libtiledbsoma/test/unit_soma_column.cc index 45a2920235..3daeba1a2c 100644 --- a/libtiledbsoma/test/unit_soma_column.cc +++ b/libtiledbsoma/test/unit_soma_column.cc @@ -30,6 +30,8 @@ * This file manages unit tests for implementation of SOMAColumn class */ +#include +#include #include #include "../src/soma/soma_attribute.h" #include "../src/soma/soma_column.h" @@ -615,3 +617,190 @@ TEST_CASE_METHOD( REQUIRE(ext_res->num_rows() == 1); } } + +TEST_CASE_METHOD( + VariouslyIndexedDataFrameFixture, + "SOMAColumn: query variant-indexed dataframe dim-str-u32 attr-sjid", + "[SOMADataFrame]") { + auto use_current_domain = GENERATE(false, true); + SECTION(std::format("- use_current_domain={}", use_current_domain)) { + auto specify_domain = GENERATE(false, true); + SECTION(std::format("- specify_domain={}", specify_domain)) { + std::string suffix1 = use_current_domain ? "true" : "false"; + std::string suffix2 = specify_domain ? "true" : "false"; + set_up( + std::make_shared(), + "mem://unit-test-column-variant-indexed-dataframe-4-" + + suffix1 + "-" + suffix2); + + std::string string_lo = specify_domain ? "apple" : ""; + std::string string_hi = specify_domain ? "zebra" : ""; + std::vector dim_infos( + {str_dim_info(use_current_domain, string_lo, string_hi), + u32_dim_info(use_current_domain)}); + std::vector attr_infos({i64_attr_info()}); + + // Create + create(dim_infos, attr_infos); + + // Check current domain + auto sdf = open(OpenMode::read); + + // External column initialization + auto raw_array = tiledb::Array( + *ctx_->tiledb_ctx(), uri_, TILEDB_READ); + std::vector> columns; + + for (auto dimension : sdf->tiledb_schema()->domain().dimensions()) { + columns.push_back( + std::make_shared(SOMADimension(dimension))); + } + + for (size_t i = 0; i < sdf->tiledb_schema()->attribute_num(); ++i) { + columns.push_back(std::make_shared( + SOMAAttribute(sdf->tiledb_schema()->attribute(i)))); + } + + CurrentDomain current_domain = sdf->get_current_domain_for_test(); + if (!use_current_domain) { + REQUIRE(current_domain.is_empty()); + } else { + REQUIRE(!current_domain.is_empty()); + REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE); + NDRectangle ndrect = current_domain.ndrectangle(); + + std::array + str_range = ndrect.range(dim_infos[0].name); + std::pair str_external = + columns[0]->core_current_domain_slot( + *ctx_, raw_array); + + if (specify_domain) { + REQUIRE(str_range[0] == str_external.first); + REQUIRE(str_range[1] == str_external.second); + } else { + // Can we write empty strings in this range? + REQUIRE(str_range[0] <= ""); + REQUIRE(str_external.first <= ""); + REQUIRE(str_range[1] >= ""); + REQUIRE(str_external.second >= ""); + // Can we write ASCII values in this range? + REQUIRE(str_range[0] < " "); + REQUIRE(str_external.first <= " "); + REQUIRE(str_range[1] > "~"); + // REQUIRE(str_external.second >= "~"); + } + + std::array u32_range = ndrect.range( + dim_infos[1].name); + std::pair u32_external = + columns[1]->core_current_domain_slot( + *ctx_, raw_array); + REQUIRE(u32_range[0] == u32_external.first); + REQUIRE(u32_range[1] == u32_external.second); + } + + // Check shape before write + std::optional actual = sdf->maybe_soma_joinid_shape(); + REQUIRE(!actual.has_value()); + + // Check domainish accessors before resize + ArrowTable non_empty_domain = sdf->get_non_empty_domain(); + std::vector + ned_str = ArrowAdapter::get_table_string_column_by_name( + non_empty_domain, "mystring"); + + std::vector + ned_str_col = ArrowAdapter::get_array_string_column( + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_non_empty_domain), + columns[0]->arrow_schema_slot(*ctx_, raw_array)); + + ArrowTable soma_domain = sdf->get_soma_domain(); + std::vector + dom_str = ArrowAdapter::get_table_string_column_by_name( + soma_domain, "mystring"); + + std::vector + dom_str_col = ArrowAdapter::get_array_string_column( + columns[0]->arrow_domain_slot( + *ctx_, + raw_array, + use_current_domain ? + Domainish::kind_core_current_domain : + Domainish::kind_core_domain), + columns[0]->arrow_schema_slot(*ctx_, raw_array)); + + ArrowTable soma_maxdomain = sdf->get_soma_maxdomain(); + std::vector + maxdom_str = ArrowAdapter::get_table_string_column_by_name( + soma_maxdomain, "mystring"); + + std::vector + maxdom_str_col = ArrowAdapter::get_array_string_column( + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_core_domain), + columns[0]->arrow_schema_slot(*ctx_, raw_array)); + + REQUIRE(ned_str == std::vector({"", ""})); + + REQUIRE(ned_str == ned_str_col); + REQUIRE(dom_str == dom_str_col); + REQUIRE(maxdom_str == maxdom_str_col); + + if (!use_current_domain) { + REQUIRE(dom_str == std::vector({"", ""})); + REQUIRE(maxdom_str == std::vector({"", ""})); + } else { + if (specify_domain) { + REQUIRE(dom_str[0] == dim_infos[0].string_lo); + REQUIRE(dom_str[1] == dim_infos[0].string_hi); + } else { + REQUIRE(dom_str == std::vector({"", ""})); + } + REQUIRE(maxdom_str == std::vector({"", ""})); + } + + sdf->close(); + + sdf = open(OpenMode::write); + write_sjid_u32_str_data_from(0); + + sdf->close(); + + sdf = open(OpenMode::read); + REQUIRE(sdf->nnz() == 2); + + sdf->close(); + + auto external_query = std::make_unique( + open(OpenMode::read), ctx_->tiledb_ctx()); + + columns[1]->select_columns(external_query); + columns[1]->set_dim_point(external_query, *ctx_, 1234); + + // Configure query and allocate result buffers + external_query->setup_read(); + external_query->submit_read(); + auto ext_res = external_query->results(); + + REQUIRE(ext_res->num_rows() == 1); + + external_query->reset(); + + columns[0]->select_columns(external_query); + columns[0]->set_dim_ranges( + external_query, + *ctx_, + std::vector( + {std::make_pair("apple", "b")})); + + // Configure query and allocate result buffers + external_query->setup_read(); + external_query->submit_read(); + ext_res = external_query->results(); + + REQUIRE(ext_res->num_rows() == 1); + } + } +} From 3f31c664b19b649f42ab9cbeaf0140eb1d199259 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Sun, 24 Nov 2024 18:20:58 +0200 Subject: [PATCH 04/16] Remove current_domain flag --- libtiledbsoma/test/unit_soma_column.cc | 474 ++++++++++++------------- 1 file changed, 219 insertions(+), 255 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_column.cc b/libtiledbsoma/test/unit_soma_column.cc index 3daeba1a2c..53117be93c 100644 --- a/libtiledbsoma/test/unit_soma_column.cc +++ b/libtiledbsoma/test/unit_soma_column.cc @@ -200,85 +200,68 @@ struct VariouslyIndexedDataFrameFixture { }; TEST_CASE("SOMAColumn: SOMADimension") { - auto use_current_domain = GENERATE(false, true); auto ctx = std::make_shared(); PlatformConfig platform_config{}; - SECTION(std::format("- use_current_domain={}", use_current_domain)) { - std::vector dim_infos( - {helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_UINT32, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A", - .use_current_domain = use_current_domain}), - helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A", - .use_current_domain = use_current_domain}), - helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_INT64, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A", - .use_current_domain = use_current_domain}), - helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_STRING_ASCII, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A", - .use_current_domain = use_current_domain})}); - - std::vector geom_dim_infos({helper::DimInfo( - {.name = "dimension", - .tiledb_datatype = TILEDB_GEOM_WKB, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A", - .use_current_domain = use_current_domain})}); - - std::vector spatial_dim_infos( - {helper::DimInfo( - {.name = "x", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 200, - .string_lo = "N/A", - .string_hi = "N/A", - .use_current_domain = use_current_domain}), - helper::DimInfo( - {.name = "y", - .tiledb_datatype = TILEDB_FLOAT64, - .dim_max = 100, - .string_lo = "N/A", - .string_hi = "N/A", - .use_current_domain = use_current_domain})}); - - auto index_columns = helper::create_column_index_info(dim_infos); - - std::vector> columns; - bool has_current_domain = true; - - for (int64_t i = 0; i < index_columns.second->n_children; ++i) { - columns.push_back(SOMADimension::create( - ctx->tiledb_ctx(), - index_columns.second->children[i], - index_columns.first->children[i], - "SOMAGeometryDataFrame", - "", - platform_config, - has_current_domain)); - - REQUIRE(has_current_domain == use_current_domain); - REQUIRE( - columns.back()->tiledb_dimensions().value()[0].type() == - dim_infos[i].tiledb_datatype); - } + std::vector dim_infos( + {helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_UINT32, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A"}), + helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A"}), + helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_INT64, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A"}), + helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_STRING_ASCII, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A"})}); + + std::vector geom_dim_infos({helper::DimInfo( + {.name = "dimension", + .tiledb_datatype = TILEDB_GEOM_WKB, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A"})}); + + std::vector spatial_dim_infos( + {helper::DimInfo( + {.name = "x", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 200, + .string_lo = "N/A", + .string_hi = "N/A"}), + helper::DimInfo( + {.name = "y", + .tiledb_datatype = TILEDB_FLOAT64, + .dim_max = 100, + .string_lo = "N/A", + .string_hi = "N/A"})}); + + auto index_columns = helper::create_column_index_info(dim_infos); + + std::vector> columns; + + for (int64_t i = 0; i < index_columns.second->n_children; ++i) { + columns.push_back(SOMADimension::create( + ctx->tiledb_ctx(), + index_columns.second->children[i], + index_columns.first->children[i], + "SOMAGeometryDataFrame", + "", + platform_config)); REQUIRE( columns.back()->tiledb_dimensions().value()[0].type() == @@ -622,185 +605,166 @@ TEST_CASE_METHOD( VariouslyIndexedDataFrameFixture, "SOMAColumn: query variant-indexed dataframe dim-str-u32 attr-sjid", "[SOMADataFrame]") { - auto use_current_domain = GENERATE(false, true); - SECTION(std::format("- use_current_domain={}", use_current_domain)) { - auto specify_domain = GENERATE(false, true); - SECTION(std::format("- specify_domain={}", specify_domain)) { - std::string suffix1 = use_current_domain ? "true" : "false"; - std::string suffix2 = specify_domain ? "true" : "false"; - set_up( - std::make_shared(), - "mem://unit-test-column-variant-indexed-dataframe-4-" + - suffix1 + "-" + suffix2); - - std::string string_lo = specify_domain ? "apple" : ""; - std::string string_hi = specify_domain ? "zebra" : ""; - std::vector dim_infos( - {str_dim_info(use_current_domain, string_lo, string_hi), - u32_dim_info(use_current_domain)}); - std::vector attr_infos({i64_attr_info()}); - - // Create - create(dim_infos, attr_infos); - - // Check current domain - auto sdf = open(OpenMode::read); - - // External column initialization - auto raw_array = tiledb::Array( - *ctx_->tiledb_ctx(), uri_, TILEDB_READ); - std::vector> columns; - - for (auto dimension : sdf->tiledb_schema()->domain().dimensions()) { - columns.push_back( - std::make_shared(SOMADimension(dimension))); - } - - for (size_t i = 0; i < sdf->tiledb_schema()->attribute_num(); ++i) { - columns.push_back(std::make_shared( - SOMAAttribute(sdf->tiledb_schema()->attribute(i)))); - } - - CurrentDomain current_domain = sdf->get_current_domain_for_test(); - if (!use_current_domain) { - REQUIRE(current_domain.is_empty()); - } else { - REQUIRE(!current_domain.is_empty()); - REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE); - NDRectangle ndrect = current_domain.ndrectangle(); - - std::array - str_range = ndrect.range(dim_infos[0].name); - std::pair str_external = - columns[0]->core_current_domain_slot( - *ctx_, raw_array); - - if (specify_domain) { - REQUIRE(str_range[0] == str_external.first); - REQUIRE(str_range[1] == str_external.second); - } else { - // Can we write empty strings in this range? - REQUIRE(str_range[0] <= ""); - REQUIRE(str_external.first <= ""); - REQUIRE(str_range[1] >= ""); - REQUIRE(str_external.second >= ""); - // Can we write ASCII values in this range? - REQUIRE(str_range[0] < " "); - REQUIRE(str_external.first <= " "); - REQUIRE(str_range[1] > "~"); - // REQUIRE(str_external.second >= "~"); - } - - std::array u32_range = ndrect.range( - dim_infos[1].name); - std::pair u32_external = - columns[1]->core_current_domain_slot( - *ctx_, raw_array); - REQUIRE(u32_range[0] == u32_external.first); - REQUIRE(u32_range[1] == u32_external.second); - } - - // Check shape before write - std::optional actual = sdf->maybe_soma_joinid_shape(); - REQUIRE(!actual.has_value()); - - // Check domainish accessors before resize - ArrowTable non_empty_domain = sdf->get_non_empty_domain(); - std::vector - ned_str = ArrowAdapter::get_table_string_column_by_name( - non_empty_domain, "mystring"); - - std::vector - ned_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_non_empty_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); - - ArrowTable soma_domain = sdf->get_soma_domain(); - std::vector - dom_str = ArrowAdapter::get_table_string_column_by_name( - soma_domain, "mystring"); - - std::vector - dom_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, - raw_array, - use_current_domain ? - Domainish::kind_core_current_domain : - Domainish::kind_core_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); - - ArrowTable soma_maxdomain = sdf->get_soma_maxdomain(); - std::vector - maxdom_str = ArrowAdapter::get_table_string_column_by_name( - soma_maxdomain, "mystring"); - - std::vector - maxdom_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_core_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); - - REQUIRE(ned_str == std::vector({"", ""})); - - REQUIRE(ned_str == ned_str_col); - REQUIRE(dom_str == dom_str_col); - REQUIRE(maxdom_str == maxdom_str_col); - - if (!use_current_domain) { - REQUIRE(dom_str == std::vector({"", ""})); - REQUIRE(maxdom_str == std::vector({"", ""})); - } else { - if (specify_domain) { - REQUIRE(dom_str[0] == dim_infos[0].string_lo); - REQUIRE(dom_str[1] == dim_infos[0].string_hi); - } else { - REQUIRE(dom_str == std::vector({"", ""})); - } - REQUIRE(maxdom_str == std::vector({"", ""})); - } - - sdf->close(); - - sdf = open(OpenMode::write); - write_sjid_u32_str_data_from(0); - - sdf->close(); - - sdf = open(OpenMode::read); - REQUIRE(sdf->nnz() == 2); - - sdf->close(); - - auto external_query = std::make_unique( - open(OpenMode::read), ctx_->tiledb_ctx()); - - columns[1]->select_columns(external_query); - columns[1]->set_dim_point(external_query, *ctx_, 1234); - - // Configure query and allocate result buffers - external_query->setup_read(); - external_query->submit_read(); - auto ext_res = external_query->results(); - - REQUIRE(ext_res->num_rows() == 1); - - external_query->reset(); - - columns[0]->select_columns(external_query); - columns[0]->set_dim_ranges( - external_query, - *ctx_, - std::vector( - {std::make_pair("apple", "b")})); - - // Configure query and allocate result buffers - external_query->setup_read(); - external_query->submit_read(); - ext_res = external_query->results(); - - REQUIRE(ext_res->num_rows() == 1); + auto specify_domain = GENERATE(false, true); + SECTION(std::format("- specify_domain={}", specify_domain)) { + std::string suffix1 = specify_domain ? "true" : "false"; + set_up( + std::make_shared(), + "mem://unit-test-column-variant-indexed-dataframe-4-" + suffix1); + + std::string string_lo = specify_domain ? "apple" : ""; + std::string string_hi = specify_domain ? "zebra" : ""; + std::vector dim_infos( + {str_dim_info(true, string_lo, string_hi), u32_dim_info(true)}); + std::vector attr_infos({i64_attr_info()}); + + // Create + create(dim_infos, attr_infos); + + // Check current domain + auto sdf = open(OpenMode::read); + + // External column initialization + auto raw_array = tiledb::Array(*ctx_->tiledb_ctx(), uri_, TILEDB_READ); + std::vector> columns; + + for (auto dimension : sdf->tiledb_schema()->domain().dimensions()) { + columns.push_back( + std::make_shared(SOMADimension(dimension))); + } + + for (size_t i = 0; i < sdf->tiledb_schema()->attribute_num(); ++i) { + columns.push_back(std::make_shared( + SOMAAttribute(sdf->tiledb_schema()->attribute(i)))); + } + + CurrentDomain current_domain = sdf->get_current_domain_for_test(); + + REQUIRE(!current_domain.is_empty()); + REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE); + NDRectangle ndrect = current_domain.ndrectangle(); + + std::array str_range = ndrect.range( + dim_infos[0].name); + std::pair + str_external = columns[0]->core_current_domain_slot( + *ctx_, raw_array); + + if (specify_domain) { + REQUIRE(str_range[0] == str_external.first); + REQUIRE(str_range[1] == str_external.second); + } else { + // Can we write empty strings in this range? + REQUIRE(str_range[0] <= ""); + REQUIRE(str_external.first <= ""); + REQUIRE(str_range[1] >= ""); + REQUIRE(str_external.second >= ""); + // Can we write ASCII values in this range? + REQUIRE(str_range[0] < " "); + REQUIRE(str_external.first <= " "); + REQUIRE(str_range[1] > "~"); + // REQUIRE(str_external.second >= "~"); + } + + std::array u32_range = ndrect.range( + dim_infos[1].name); + std::pair + u32_external = columns[1]->core_current_domain_slot( + *ctx_, raw_array); + REQUIRE(u32_range[0] == u32_external.first); + REQUIRE(u32_range[1] == u32_external.second); + + // Check shape before write + std::optional actual = sdf->maybe_soma_joinid_shape(); + REQUIRE(!actual.has_value()); + + // Check domainish accessors before resize + ArrowTable non_empty_domain = sdf->get_non_empty_domain(); + std::vector + ned_str = ArrowAdapter::get_table_string_column_by_name( + non_empty_domain, "mystring"); + + std::vector + ned_str_col = ArrowAdapter::get_array_string_column( + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_non_empty_domain), + columns[0]->arrow_schema_slot(*ctx_, raw_array)); + + ArrowTable soma_domain = sdf->get_soma_domain(); + std::vector + dom_str = ArrowAdapter::get_table_string_column_by_name( + soma_domain, "mystring"); + + std::vector + dom_str_col = ArrowAdapter::get_array_string_column( + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_core_current_domain), + columns[0]->arrow_schema_slot(*ctx_, raw_array)); + + ArrowTable soma_maxdomain = sdf->get_soma_maxdomain(); + std::vector + maxdom_str = ArrowAdapter::get_table_string_column_by_name( + soma_maxdomain, "mystring"); + + std::vector + maxdom_str_col = ArrowAdapter::get_array_string_column( + columns[0]->arrow_domain_slot( + *ctx_, raw_array, Domainish::kind_core_domain), + columns[0]->arrow_schema_slot(*ctx_, raw_array)); + + REQUIRE(ned_str == std::vector({"", ""})); + + REQUIRE(ned_str == ned_str_col); + REQUIRE(dom_str == dom_str_col); + REQUIRE(maxdom_str == maxdom_str_col); + + if (specify_domain) { + REQUIRE(dom_str[0] == dim_infos[0].string_lo); + REQUIRE(dom_str[1] == dim_infos[0].string_hi); + } else { + REQUIRE(dom_str == std::vector({"", ""})); } + REQUIRE(maxdom_str == std::vector({"", ""})); + + sdf->close(); + + sdf = open(OpenMode::write); + write_sjid_u32_str_data_from(0); + + sdf->close(); + + sdf = open(OpenMode::read); + REQUIRE(sdf->nnz() == 2); + + sdf->close(); + + auto external_query = std::make_unique( + open(OpenMode::read), ctx_->tiledb_ctx()); + + columns[1]->select_columns(external_query); + columns[1]->set_dim_point(external_query, *ctx_, 1234); + + // Configure query and allocate result buffers + external_query->setup_read(); + external_query->submit_read(); + auto ext_res = external_query->results(); + + REQUIRE(ext_res->num_rows() == 1); + + external_query->reset(); + + columns[0]->select_columns(external_query); + columns[0]->set_dim_ranges( + external_query, + *ctx_, + std::vector( + {std::make_pair("apple", "b")})); + + // Configure query and allocate result buffers + external_query->setup_read(); + external_query->submit_read(); + ext_res = external_query->results(); + + REQUIRE(ext_res->num_rows() == 1); } } From a401e7469b844cd9c3e12951e543bcd2513a2fff Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Mon, 25 Nov 2024 13:09:39 +0200 Subject: [PATCH 05/16] Do not export soma column [skip ci] --- libtiledbsoma/src/tiledbsoma/tiledbsoma | 3 --- 1 file changed, 3 deletions(-) diff --git a/libtiledbsoma/src/tiledbsoma/tiledbsoma b/libtiledbsoma/src/tiledbsoma/tiledbsoma index 59b3d89b1b..98e3075639 100644 --- a/libtiledbsoma/src/tiledbsoma/tiledbsoma +++ b/libtiledbsoma/src/tiledbsoma/tiledbsoma @@ -50,9 +50,6 @@ #include "soma/soma_coordinates.h" #include "soma/soma_array.h" #include "soma/soma_collection.h" -#include "soma/soma_column.h" -#include "soma/soma_dimension.h" -#include "soma/soma_geometry_column.h" #include "soma/soma_dataframe.h" #include "soma/soma_group.h" #include "soma/soma_experiment.h" From 4a37e913ade7a1d7bddf042160d51420815821f1 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Fri, 6 Dec 2024 14:27:20 +0200 Subject: [PATCH 06/16] Replace string_view with string when returning column name, add current domain checks, replace vector with span when selecting points --- libtiledbsoma/test/unit_soma_column.cc | 31 +++++++++++--------------- 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_column.cc b/libtiledbsoma/test/unit_soma_column.cc index 53117be93c..6ee99990c0 100644 --- a/libtiledbsoma/test/unit_soma_column.cc +++ b/libtiledbsoma/test/unit_soma_column.cc @@ -612,10 +612,10 @@ TEST_CASE_METHOD( std::make_shared(), "mem://unit-test-column-variant-indexed-dataframe-4-" + suffix1); - std::string string_lo = specify_domain ? "apple" : ""; - std::string string_hi = specify_domain ? "zebra" : ""; + std::string string_lo = ""; + std::string string_hi = ""; std::vector dim_infos( - {str_dim_info(true, string_lo, string_hi), u32_dim_info(true)}); + {str_dim_info(string_lo, string_hi), u32_dim_info()}); std::vector attr_infos({i64_attr_info()}); // Create @@ -650,21 +650,16 @@ TEST_CASE_METHOD( str_external = columns[0]->core_current_domain_slot( *ctx_, raw_array); - if (specify_domain) { - REQUIRE(str_range[0] == str_external.first); - REQUIRE(str_range[1] == str_external.second); - } else { - // Can we write empty strings in this range? - REQUIRE(str_range[0] <= ""); - REQUIRE(str_external.first <= ""); - REQUIRE(str_range[1] >= ""); - REQUIRE(str_external.second >= ""); - // Can we write ASCII values in this range? - REQUIRE(str_range[0] < " "); - REQUIRE(str_external.first <= " "); - REQUIRE(str_range[1] > "~"); - // REQUIRE(str_external.second >= "~"); - } + // Can we write empty strings in this range? + REQUIRE(str_range[0] <= ""); + REQUIRE(str_external.first <= ""); + REQUIRE(str_range[1] >= ""); + REQUIRE(str_external.second >= ""); + // Can we write ASCII values in this range? + REQUIRE(str_range[0] < " "); + REQUIRE(str_external.first <= " "); + REQUIRE(str_range[1] > "~"); + // REQUIRE(str_external.second >= "~"); std::array u32_range = ndrect.range( dim_infos[1].name); From c2deec8d522eba1c239cfb5ec7a397aa073140f8 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Thu, 16 Jan 2025 13:09:29 +0200 Subject: [PATCH 07/16] Add serialization/deserelization methods --- libtiledbsoma/src/soma/soma_attribute.cc | 42 +++++++++++++ libtiledbsoma/src/soma/soma_attribute.h | 9 +++ libtiledbsoma/src/soma/soma_column.cc | 47 ++++++++++++++ libtiledbsoma/src/soma/soma_column.h | 16 +++++ libtiledbsoma/src/soma/soma_dimension.cc | 33 ++++++++++ libtiledbsoma/src/soma/soma_dimension.h | 9 +++ .../src/soma/soma_geometry_column.cc | 63 +++++++++++++++++++ libtiledbsoma/src/soma/soma_geometry_column.h | 8 +++ libtiledbsoma/src/utils/common.h | 6 ++ 9 files changed, 233 insertions(+) diff --git a/libtiledbsoma/src/soma/soma_attribute.cc b/libtiledbsoma/src/soma/soma_attribute.cc index 49e834124f..da589a3d60 100644 --- a/libtiledbsoma/src/soma/soma_attribute.cc +++ b/libtiledbsoma/src/soma/soma_attribute.cc @@ -33,6 +33,38 @@ #include "soma_attribute.h" namespace tiledbsoma { +std::shared_ptr SOMAAttribute::deserialize( + nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_ATTR_KEY)) { + throw TileDBSOMAError( + "[SOMAAttribute][deserialize] Missing required field " + "'tiledb_attributes'"); + } + + std::vector + attribute_names = soma_schema[TDB_SOMA_SCHEMA_COL_ATTR_KEY] + .template get>(); + + if (attribute_names.size() != 1) { + throw TileDBSOMAError(std::format( + "[SOMAAttribute][deserialize] Invalid number of attributes. " + "Epected 1, got {}", + attribute_names.size())); + } + + auto attribute = array.schema().attribute(attribute_names[0]); + auto enumeration_name = AttributeExperimental::get_enumeration_name( + ctx, attribute); + + std::optional + enumeration = enumeration_name ? + std::make_optional(ArrayExperimental::get_enumeration( + ctx, array, attribute.name())) : + std::nullopt; + + return std::make_shared(attribute, enumeration); +} + std::shared_ptr SOMAAttribute::create( std::shared_ptr ctx, ArrowSchema* schema, @@ -124,4 +156,14 @@ ArrowSchema* SOMAAttribute::arrow_schema_slot( attribute, *ctx.tiledb_ctx(), array) .release(); } + +void SOMAAttribute::serialize(nlohmann::json& columns_schema) const { + nlohmann::json column; + + column[TDB_SOMA_SCHEMA_COL_TYPE_KEY] = static_cast( + soma_column_datatype_t::SOMA_COLUMN_ATTRIBUTE); + column[TDB_SOMA_SCHEMA_COL_ATTR_KEY] = {attribute.name()}; + + columns_schema.push_back(column); +} } // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_attribute.h b/libtiledbsoma/src/soma/soma_attribute.h index 57bb90748f..4a57c98236 100644 --- a/libtiledbsoma/src/soma/soma_attribute.h +++ b/libtiledbsoma/src/soma/soma_attribute.h @@ -47,6 +47,13 @@ using namespace tiledb; class SOMAAttribute : public SOMAColumn { public: + //=================================================================== + //= public static + //=================================================================== + + static std::shared_ptr deserialize( + nlohmann::json& soma_schema, const Context& ctx, const Array& array); + /** * Create a ``SOMAAttribute`` shared pointer from an Arrow schema */ @@ -114,6 +121,8 @@ class SOMAAttribute : public SOMAColumn { ArrowSchema* arrow_schema_slot( const SOMAContext& ctx, Array& array) override; + void serialize(nlohmann::json&) const override; + private: void _set_dim_points( const std::unique_ptr& query, diff --git a/libtiledbsoma/src/soma/soma_column.cc b/libtiledbsoma/src/soma/soma_column.cc index d55888ff79..1a95039419 100644 --- a/libtiledbsoma/src/soma/soma_column.cc +++ b/libtiledbsoma/src/soma/soma_column.cc @@ -32,8 +32,55 @@ #include "soma_column.h" +#include "soma_attribute.h" +#include "soma_dimension.h" +#include "soma_geometry_column.h" + namespace tiledbsoma { +std::map SOMAColumn::deserialiser_map = { + {soma_column_datatype_t::SOMA_COLUMN_ATTRIBUTE, SOMAAttribute::deserialize}, + {soma_column_datatype_t::SOMA_COLUMN_DIMENSION, SOMADimension::deserialize}, + {soma_column_datatype_t::SOMA_COLUMN_GEOMETRY, + SOMAGeometryColumn::deserialize}}; + +std::vector> SOMAColumn::deserialize( + nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + std::vector> columns; + + if (soma_schema.contains(TDB_SOMA_SCHEMA_COL_KEY)) { + for (auto& column : soma_schema[TDB_SOMA_SCHEMA_COL_KEY]) { + auto type = column[TDB_SOMA_SCHEMA_COL_TYPE_KEY] + .template get(); + + columns.push_back(deserialiser_map[type](column, ctx, array)); + } + } else { + // All arrays before the introduction of SOMAColumn do not have + // composite columns, thus the metadata are trivially contructuble + for (auto& dimension : array.schema().domain().dimensions()) { + columns.push_back(std::make_shared(dimension)); + } + + for (auto& attribute : array.schema().attributes()) { + auto enumeration_name = AttributeExperimental::get_enumeration_name( + ctx, attribute.second); + auto enumeration = enumeration_name.has_value() ? + std::make_optional( + ArrayExperimental::get_enumeration( + ctx, + array, + attribute.second.name())) : + std::nullopt; + + columns.push_back( + std::make_shared(attribute.second, enumeration)); + } + } + + return columns; +} + template <> std::pair SOMAColumn::core_domain_slot() const { diff --git a/libtiledbsoma/src/soma/soma_column.h b/libtiledbsoma/src/soma/soma_column.h index 4b2f513819..8e13a74bad 100644 --- a/libtiledbsoma/src/soma/soma_column.h +++ b/libtiledbsoma/src/soma/soma_column.h @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -52,7 +53,17 @@ namespace tiledbsoma { using namespace tiledb; class SOMAColumn { + typedef std::shared_ptr (*Factory)( + nlohmann::json&, const Context&, const Array&); + public: + //=================================================================== + //= public static + //=================================================================== + + static std::vector> deserialize( + nlohmann::json& soma_schema, const Context& ctx, const Array& array); + //=================================================================== //= public non-static //=================================================================== @@ -474,6 +485,8 @@ class SOMAColumn { } } + virtual void serialize(nlohmann::json&) const = 0; + protected: virtual void _set_dim_points( const std::unique_ptr& query, @@ -500,6 +513,9 @@ class SOMAColumn { const SOMAContext& ctx, Array& array) const = 0; virtual std::any _core_current_domain_slot(NDRectangle& ndrect) const = 0; + + private: + static std::map deserialiser_map; }; template <> diff --git a/libtiledbsoma/src/soma/soma_dimension.cc b/libtiledbsoma/src/soma/soma_dimension.cc index 7f667bf323..f3af0894b9 100644 --- a/libtiledbsoma/src/soma/soma_dimension.cc +++ b/libtiledbsoma/src/soma/soma_dimension.cc @@ -35,6 +35,30 @@ namespace tiledbsoma { +std::shared_ptr SOMADimension::deserialize( + nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_DIM_KEY)) { + throw TileDBSOMAError( + "[SOMADimension][deserialize] Missing required field " + "'tiledb_dimensions'"); + } + + std::vector + dimension_names = soma_schema[TDB_SOMA_SCHEMA_COL_DIM_KEY] + .template get>(); + + if (dimension_names.size() != 1) { + throw TileDBSOMAError(std::format( + "[SOMADimension][deserialize] Invalid number of dimensions. " + "Epected 1, got {}", + dimension_names.size())); + } + + auto dimension = array.schema().domain().dimension(dimension_names[0]); + + return std::make_shared(dimension); +} + std::shared_ptr SOMADimension::create( std::shared_ptr ctx, ArrowSchema* schema, @@ -764,4 +788,13 @@ ArrowSchema* SOMADimension::arrow_schema_slot(const SOMAContext&, Array&) { .release(); } +void SOMADimension::serialize(nlohmann::json& columns_schema) const { + nlohmann::json column; + + column[TDB_SOMA_SCHEMA_COL_TYPE_KEY] = static_cast( + soma_column_datatype_t::SOMA_COLUMN_DIMENSION); + column[TDB_SOMA_SCHEMA_COL_DIM_KEY] = {dimension.name()}; + + columns_schema.push_back(column); +} } // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_dimension.h b/libtiledbsoma/src/soma/soma_dimension.h index b556476bc2..f0f97e7efd 100644 --- a/libtiledbsoma/src/soma/soma_dimension.h +++ b/libtiledbsoma/src/soma/soma_dimension.h @@ -48,6 +48,13 @@ using namespace tiledb; class SOMADimension : public SOMAColumn { public: + //=================================================================== + //= public static + //=================================================================== + + static std::shared_ptr deserialize( + nlohmann::json& soma_schema, const Context& ctx, const Array& array); + static std::shared_ptr create( std::shared_ptr ctx, ArrowSchema* schema, @@ -107,6 +114,8 @@ class SOMADimension : public SOMAColumn { ArrowSchema* arrow_schema_slot( const SOMAContext& ctx, Array& array) override; + void serialize(nlohmann::json&) const override; + protected: void _set_dim_points( const std::unique_ptr& query, diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc b/libtiledbsoma/src/soma/soma_geometry_column.cc index 51e5fefecf..99816cfe98 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.cc +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -33,6 +33,52 @@ #include "soma_geometry_column.h" namespace tiledbsoma { +std::shared_ptr SOMAGeometryColumn::deserialize( + nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_DIM_KEY)) { + throw TileDBSOMAError( + "[SOMAGeometryColumn][deserialize] Missing required field " + "'tiledb_dimensions'"); + } + if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_ATTR_KEY)) { + throw TileDBSOMAError( + "[SOMAGeometryColumn][deserialize] Missing required field " + "'tiledb_attributes'"); + } + + std::vector + dimension_names = soma_schema[TDB_SOMA_SCHEMA_COL_DIM_KEY] + .template get>(); + + std::vector + attribute_names = soma_schema[TDB_SOMA_SCHEMA_COL_ATTR_KEY] + .template get>(); + + if (dimension_names.size() % 2 != 0) { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn][deserialize] Invalid number of dimensions. " + "Epected number divisible by 2, got {}", + dimension_names.size())); + } + if (attribute_names.size() != 1) { + throw TileDBSOMAError(std::format( + "[SOMAGeometryColumn][deserialize] Invalid number of attributes. " + "Epected 1, got {}", + attribute_names.size())); + } + + std::vector dimensions; + std::for_each( + dimension_names.cbegin(), + dimension_names.cend(), + [&array, &dimensions](const std::string& name) { + dimensions.push_back(array.schema().domain().dimension(name)); + }); + + auto attribute = array.schema().attribute(attribute_names[0]); + + return std::make_shared(dimensions, attribute); +} std::shared_ptr SOMAGeometryColumn::create( std::shared_ptr ctx, @@ -415,4 +461,21 @@ ArrowSchema* SOMAGeometryColumn::arrow_schema_slot( .release(); } +void SOMAGeometryColumn::serialize(nlohmann::json& columns_schema) const { + nlohmann::json column; + + column[TDB_SOMA_SCHEMA_COL_TYPE_KEY] = static_cast( + soma_column_datatype_t::SOMA_COLUMN_GEOMETRY); + + column[TDB_SOMA_SCHEMA_COL_DIM_KEY] = nlohmann::json::array(); + std::for_each( + dimensions.cbegin(), + dimensions.cend(), + [&column](const Dimension& dim) { + column[TDB_SOMA_SCHEMA_COL_DIM_KEY].push_back(dim.name()); + }); + column[TDB_SOMA_SCHEMA_COL_ATTR_KEY] = {attribute.name()}; + + columns_schema.push_back(column); +} } // namespace tiledbsoma \ No newline at end of file diff --git a/libtiledbsoma/src/soma/soma_geometry_column.h b/libtiledbsoma/src/soma/soma_geometry_column.h index db1995bb5d..16c5a33918 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.h +++ b/libtiledbsoma/src/soma/soma_geometry_column.h @@ -56,6 +56,12 @@ using namespace tiledb; class SOMAGeometryColumn : public SOMAColumn { public: + //=================================================================== + //= public static + //=================================================================== + static std::shared_ptr deserialize( + nlohmann::json& soma_schema, const Context& ctx, const Array& array); + static std::shared_ptr create( std::shared_ptr ctx, ArrowSchema* schema, @@ -116,6 +122,8 @@ class SOMAGeometryColumn : public SOMAColumn { ArrowSchema* arrow_schema_slot( const SOMAContext& ctx, Array& array) override; + void serialize(nlohmann::json&) const override; + protected: void _set_dim_points( const std::unique_ptr& query, diff --git a/libtiledbsoma/src/utils/common.h b/libtiledbsoma/src/utils/common.h index 13632a173c..eea811475d 100644 --- a/libtiledbsoma/src/utils/common.h +++ b/libtiledbsoma/src/utils/common.h @@ -50,6 +50,12 @@ const std::string SOMA_GEOMETRY_COLUMN_NAME = "soma_geometry"; const std::string SOMA_GEOMETRY_DIMENSION_PREFIX = "tiledb__internal__"; const std::string ARROW_DATATYPE_METADATA_KEY = "dtype"; +// SOMAColumn metadata keys +const std::string TDB_SOMA_SCHEMA_COL_KEY = "tiledb_columns"; +const std::string TDB_SOMA_SCHEMA_COL_TYPE_KEY = "tiledb_column_type"; +const std::string TDB_SOMA_SCHEMA_COL_DIM_KEY = "tiledb_dimensions"; +const std::string TDB_SOMA_SCHEMA_COL_ATTR_KEY = "tiledb_attributes"; + using MetadataValue = std::tuple; enum MetadataInfo { dtype = 0, num, value }; From 3e1e1ffc79cbfc23e8bbbff305d204fc00707881 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Fri, 17 Jan 2025 20:10:17 +0200 Subject: [PATCH 08/16] Serialize SOMAColumn on schema generation --- libtiledbsoma/src/soma/soma_array.cc | 25 +++++++++++++++++++ libtiledbsoma/src/soma/soma_array.h | 7 ++++++ libtiledbsoma/src/soma/soma_attribute.cc | 2 +- libtiledbsoma/src/soma/soma_attribute.h | 4 ++- libtiledbsoma/src/soma/soma_column.cc | 8 +++--- libtiledbsoma/src/soma/soma_column.h | 6 +++-- libtiledbsoma/src/soma/soma_dataframe.cc | 21 ++++++++++------ libtiledbsoma/src/soma/soma_dense_ndarray.cc | 23 +++++++++++------ libtiledbsoma/src/soma/soma_dimension.cc | 2 +- libtiledbsoma/src/soma/soma_dimension.h | 4 ++- .../src/soma/soma_geometry_column.cc | 2 +- libtiledbsoma/src/soma/soma_geometry_column.h | 4 ++- .../src/soma/soma_geometry_dataframe.cc | 24 +++++++++++------- .../src/soma/soma_point_cloud_dataframe.cc | 22 ++++++++++------ libtiledbsoma/src/soma/soma_sparse_ndarray.cc | 23 +++++++++++------ libtiledbsoma/src/utils/arrow_adapter.cc | 14 +++++++++-- libtiledbsoma/src/utils/arrow_adapter.h | 9 ++++--- libtiledbsoma/src/utils/common.h | 2 ++ 18 files changed, 146 insertions(+), 56 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index b2158b93d9..28734b3fd6 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -46,6 +46,7 @@ void SOMAArray::create( std::string_view uri, ArraySchema schema, std::string_view soma_type, + std::string_view soma_schema, std::optional timestamp) { Array::create(std::string(uri), schema); @@ -73,6 +74,12 @@ void SOMAArray::create( TILEDB_STRING_UTF8, static_cast(ENCODING_VERSION_VAL.length()), ENCODING_VERSION_VAL.c_str()); + + array->put_metadata( + TDB_SOMA_SCHEMA_KEY, + TILEDB_STRING_UTF8, + static_cast(soma_schema.length()), + soma_schema.data()); } std::unique_ptr SOMAArray::open( @@ -207,6 +214,24 @@ void SOMAArray::fill_metadata_cache(std::optional timestamp) { } } +void SOMAArray::fill_columns(const nlohmann::json& soma_schema_extension) { + if (!soma_schema_extension.contains(TDB_SOMA_SCHEMA_COL_KEY)) { + LOG_WARN(std::format( + "[SOMAArray][fill_columns] Missing '{}' key from '{}'", + TDB_SOMA_SCHEMA_COL_KEY, + TDB_SOMA_SCHEMA_KEY)); + } + + soma_schema_extension.value( + TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()); + + columns_ = SOMAColumn::deserialize( + soma_schema_extension.value( + TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()), + *ctx_->tiledb_ctx(), + *arr_); +} + const std::string SOMAArray::uri() const { return uri_; }; diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 6b290564ea..88baf62de2 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -45,6 +45,7 @@ #include "enums.h" #include "logger_public.h" #include "managed_query.h" +#include "soma_column.h" #include "soma_object.h" // ================================================================ @@ -112,6 +113,7 @@ class SOMAArray : public SOMAObject { std::string_view uri, ArraySchema schema, std::string_view soma_type, + std::string_view soma_schema, std::optional timestamp = std::nullopt); /** @@ -1538,6 +1540,8 @@ class SOMAArray : public SOMAObject { void fill_metadata_cache(std::optional timestamp); + void fill_columns(const nlohmann::json& soma_schema_extension); + // SOMAArray URI std::string uri_; @@ -1560,6 +1564,9 @@ class SOMAArray : public SOMAObject { // Metadata cache std::map metadata_; + // SOMAColumn list + std::vector> columns_; + // Read timestamp range (start, end) std::optional timestamp_; diff --git a/libtiledbsoma/src/soma/soma_attribute.cc b/libtiledbsoma/src/soma/soma_attribute.cc index da589a3d60..bae5ab6b62 100644 --- a/libtiledbsoma/src/soma/soma_attribute.cc +++ b/libtiledbsoma/src/soma/soma_attribute.cc @@ -34,7 +34,7 @@ namespace tiledbsoma { std::shared_ptr SOMAAttribute::deserialize( - nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + const nlohmann::json& soma_schema, const Context& ctx, const Array& array) { if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_ATTR_KEY)) { throw TileDBSOMAError( "[SOMAAttribute][deserialize] Missing required field " diff --git a/libtiledbsoma/src/soma/soma_attribute.h b/libtiledbsoma/src/soma/soma_attribute.h index 4a57c98236..5525d2f978 100644 --- a/libtiledbsoma/src/soma/soma_attribute.h +++ b/libtiledbsoma/src/soma/soma_attribute.h @@ -52,7 +52,9 @@ class SOMAAttribute : public SOMAColumn { //=================================================================== static std::shared_ptr deserialize( - nlohmann::json& soma_schema, const Context& ctx, const Array& array); + const nlohmann::json& soma_schema, + const Context& ctx, + const Array& array); /** * Create a ``SOMAAttribute`` shared pointer from an Arrow schema diff --git a/libtiledbsoma/src/soma/soma_column.cc b/libtiledbsoma/src/soma/soma_column.cc index 1a95039419..cc4b769600 100644 --- a/libtiledbsoma/src/soma/soma_column.cc +++ b/libtiledbsoma/src/soma/soma_column.cc @@ -45,11 +45,13 @@ std::map SOMAColumn::deserialiser_map = { SOMAGeometryColumn::deserialize}}; std::vector> SOMAColumn::deserialize( - nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + const nlohmann::json& soma_schema_columns, + const Context& ctx, + const Array& array) { std::vector> columns; - if (soma_schema.contains(TDB_SOMA_SCHEMA_COL_KEY)) { - for (auto& column : soma_schema[TDB_SOMA_SCHEMA_COL_KEY]) { + if (!soma_schema_columns.empty()) { + for (auto& column : soma_schema_columns) { auto type = column[TDB_SOMA_SCHEMA_COL_TYPE_KEY] .template get(); diff --git a/libtiledbsoma/src/soma/soma_column.h b/libtiledbsoma/src/soma/soma_column.h index 8e13a74bad..8699aa2e12 100644 --- a/libtiledbsoma/src/soma/soma_column.h +++ b/libtiledbsoma/src/soma/soma_column.h @@ -54,7 +54,7 @@ using namespace tiledb; class SOMAColumn { typedef std::shared_ptr (*Factory)( - nlohmann::json&, const Context&, const Array&); + const nlohmann::json&, const Context&, const Array&); public: //=================================================================== @@ -62,7 +62,9 @@ class SOMAColumn { //=================================================================== static std::vector> deserialize( - nlohmann::json& soma_schema, const Context& ctx, const Array& array); + const nlohmann::json& soma_schema, + const Context& ctx, + const Array& array); //=================================================================== //= public non-static diff --git a/libtiledbsoma/src/soma/soma_dataframe.cc b/libtiledbsoma/src/soma/soma_dataframe.cc index 8a5772778e..4adeff1eae 100644 --- a/libtiledbsoma/src/soma/soma_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_dataframe.cc @@ -48,14 +48,21 @@ void SOMADataFrame::create( std::shared_ptr ctx, PlatformConfig platform_config, std::optional timestamp) { - auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( - ctx->tiledb_ctx(), - schema, - index_columns, + auto [tiledb_schema, soma_schema_extension] = + ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), + schema, + index_columns, + "SOMADataFrame", + true, + platform_config); + SOMAArray::create( + ctx, + uri, + tiledb_schema, "SOMADataFrame", - true, - platform_config); - SOMAArray::create(ctx, uri, tiledb_schema, "SOMADataFrame", timestamp); + soma_schema_extension.dump(), + timestamp); } std::unique_ptr SOMADataFrame::open( diff --git a/libtiledbsoma/src/soma/soma_dense_ndarray.cc b/libtiledbsoma/src/soma/soma_dense_ndarray.cc index da464ff19b..a9d40fb3ca 100644 --- a/libtiledbsoma/src/soma/soma_dense_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_dense_ndarray.cc @@ -79,15 +79,22 @@ void SOMADenseNDArray::create( attr->metadata = nullptr; attr->release = &ArrowAdapter::release_schema; - auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( - ctx->tiledb_ctx(), - schema, - index_columns, + auto [tiledb_schema, soma_schema_extension] = + ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), + schema, + index_columns, + "SOMADenseNDArray", + false, + platform_config); + + SOMAArray::create( + ctx, + uri, + tiledb_schema, "SOMADenseNDArray", - false, - platform_config); - - SOMAArray::create(ctx, uri, tiledb_schema, "SOMADenseNDArray", timestamp); + soma_schema_extension.dump(), + timestamp); } std::unique_ptr SOMADenseNDArray::open( diff --git a/libtiledbsoma/src/soma/soma_dimension.cc b/libtiledbsoma/src/soma/soma_dimension.cc index f3af0894b9..99167cee09 100644 --- a/libtiledbsoma/src/soma/soma_dimension.cc +++ b/libtiledbsoma/src/soma/soma_dimension.cc @@ -36,7 +36,7 @@ namespace tiledbsoma { std::shared_ptr SOMADimension::deserialize( - nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + const nlohmann::json& soma_schema, const Context& ctx, const Array& array) { if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_DIM_KEY)) { throw TileDBSOMAError( "[SOMADimension][deserialize] Missing required field " diff --git a/libtiledbsoma/src/soma/soma_dimension.h b/libtiledbsoma/src/soma/soma_dimension.h index f0f97e7efd..cdb76e9b96 100644 --- a/libtiledbsoma/src/soma/soma_dimension.h +++ b/libtiledbsoma/src/soma/soma_dimension.h @@ -53,7 +53,9 @@ class SOMADimension : public SOMAColumn { //=================================================================== static std::shared_ptr deserialize( - nlohmann::json& soma_schema, const Context& ctx, const Array& array); + const nlohmann::json& soma_schema, + const Context& ctx, + const Array& array); static std::shared_ptr create( std::shared_ptr ctx, diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc b/libtiledbsoma/src/soma/soma_geometry_column.cc index 99816cfe98..f4c47f6617 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.cc +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -34,7 +34,7 @@ namespace tiledbsoma { std::shared_ptr SOMAGeometryColumn::deserialize( - nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + const nlohmann::json& soma_schema, const Context& ctx, const Array& array) { if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_DIM_KEY)) { throw TileDBSOMAError( "[SOMAGeometryColumn][deserialize] Missing required field " diff --git a/libtiledbsoma/src/soma/soma_geometry_column.h b/libtiledbsoma/src/soma/soma_geometry_column.h index 16c5a33918..96528cd454 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.h +++ b/libtiledbsoma/src/soma/soma_geometry_column.h @@ -60,7 +60,9 @@ class SOMAGeometryColumn : public SOMAColumn { //= public static //=================================================================== static std::shared_ptr deserialize( - nlohmann::json& soma_schema, const Context& ctx, const Array& array); + const nlohmann::json& soma_schema, + const Context& ctx, + const Array& array); static std::shared_ptr create( std::shared_ptr ctx, diff --git a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc index 5f3351b079..141aee73f5 100644 --- a/libtiledbsoma/src/soma/soma_geometry_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_geometry_dataframe.cc @@ -55,17 +55,23 @@ void SOMAGeometryDataFrame::create( PlatformConfig platform_config, std::optional timestamp) { std::vector spatial_axes; - auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( - ctx->tiledb_ctx(), - schema, - index_columns, - "SOMAGeometryDataFrame", - true, - platform_config, - spatial_columns); + auto [tiledb_schema, soma_schema_extension] = + ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), + schema, + index_columns, + "SOMAGeometryDataFrame", + true, + platform_config, + spatial_columns); SOMAArray::create( - ctx, uri, tiledb_schema, "SOMAGeometryDataFrame", timestamp); + ctx, + uri, + tiledb_schema, + "SOMAGeometryDataFrame", + soma_schema_extension.dump(), + timestamp); } std::unique_ptr SOMAGeometryDataFrame::open( diff --git a/libtiledbsoma/src/soma/soma_point_cloud_dataframe.cc b/libtiledbsoma/src/soma/soma_point_cloud_dataframe.cc index c41d0ebe5e..1d28e442fc 100644 --- a/libtiledbsoma/src/soma/soma_point_cloud_dataframe.cc +++ b/libtiledbsoma/src/soma/soma_point_cloud_dataframe.cc @@ -46,15 +46,21 @@ void SOMAPointCloudDataFrame::create( std::shared_ptr ctx, PlatformConfig platform_config, std::optional timestamp) { - auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( - ctx->tiledb_ctx(), - schema, - index_columns, - "SOMAPointCloudDataFrame", - true, - platform_config); + auto [tiledb_schema, soma_schema_extension] = + ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), + schema, + index_columns, + "SOMAPointCloudDataFrame", + true, + platform_config); SOMAArray::create( - ctx, uri, tiledb_schema, "SOMAPointCloudDataFrame", timestamp); + ctx, + uri, + tiledb_schema, + "SOMAPointCloudDataFrame", + soma_schema_extension.dump(), + timestamp); } std::unique_ptr SOMAPointCloudDataFrame::open( diff --git a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc index 7fe741ce5a..87a1c1e486 100644 --- a/libtiledbsoma/src/soma/soma_sparse_ndarray.cc +++ b/libtiledbsoma/src/soma/soma_sparse_ndarray.cc @@ -80,15 +80,22 @@ void SOMASparseNDArray::create( attr->metadata = nullptr; attr->release = &ArrowAdapter::release_schema; - auto tiledb_schema = ArrowAdapter::tiledb_schema_from_arrow_schema( - ctx->tiledb_ctx(), - schema, - index_columns, + auto [tiledb_schema, soma_schema_extension] = + ArrowAdapter::tiledb_schema_from_arrow_schema( + ctx->tiledb_ctx(), + schema, + index_columns, + "SOMASparseNDArray", + true, + platform_config); + + SOMAArray::create( + ctx, + uri, + tiledb_schema, "SOMASparseNDArray", - true, - platform_config); - - SOMAArray::create(ctx, uri, tiledb_schema, "SOMASparseNDArray", timestamp); + soma_schema_extension.dump(), + timestamp); } std::unique_ptr SOMASparseNDArray::open( diff --git a/libtiledbsoma/src/utils/arrow_adapter.cc b/libtiledbsoma/src/utils/arrow_adapter.cc index df96812923..e1edfa9ef2 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.cc +++ b/libtiledbsoma/src/utils/arrow_adapter.cc @@ -885,7 +885,8 @@ tiledb_layout_t ArrowAdapter::_get_order(std::string order) { } } -ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( +std::tuple +ArrowAdapter::tiledb_schema_from_arrow_schema( std::shared_ptr ctx, const std::unique_ptr& arrow_schema, const ArrowTable& index_column_info, @@ -1085,8 +1086,17 @@ ArraySchema ArrowAdapter::tiledb_schema_from_arrow_schema( LOG_DEBUG(std::format("[ArrowAdapter] check")); schema.check(); + LOG_DEBUG(std::format("[ArrowAdapter] Additional schema metadata")); + nlohmann::json soma_schema_extension; + + soma_schema_extension[TDB_SOMA_SCHEMA_COL_KEY] = nlohmann::json::array(); + for (const auto& column : columns) { + column->serialize(soma_schema_extension[TDB_SOMA_SCHEMA_COL_KEY]); + } + soma_schema_extension["version"] = TDB_SOMA_SCHEMA_VERSION; + LOG_DEBUG(std::format("[ArrowAdapter] returning")); - return schema; + return std::make_tuple(schema, soma_schema_extension); } Dimension ArrowAdapter::tiledb_dimension_from_arrow_schema( diff --git a/libtiledbsoma/src/utils/arrow_adapter.h b/libtiledbsoma/src/utils/arrow_adapter.h index f0f8693148..78e4b1544c 100644 --- a/libtiledbsoma/src/utils/arrow_adapter.h +++ b/libtiledbsoma/src/utils/arrow_adapter.h @@ -373,7 +373,9 @@ class ArrowAdapter { ArraySchema tiledb_schema); /** - * @brief Create a TileDB ArraySchema from ArrowSchema + * @brief Create a TileDB ArraySchema from ArrowSchema and additional JSON + * encoded metadata to handle higher level SOMA construct not supported by + * TileDB (e.g. SOMAColumn) * * The number of rows in index_column_info was three without core * current-domain support, and is five with core current-domain support: @@ -384,9 +386,10 @@ class ArrowAdapter { * - Slot 3: core current-domain low value (inclusive) * - Slot 4: core current-domain high value (inclusive) * - * @return tiledb::ArraySchema + * @return std::tuple */ - static ArraySchema tiledb_schema_from_arrow_schema( + static std::tuple + tiledb_schema_from_arrow_schema( std::shared_ptr ctx, const std::unique_ptr& arrow_schema, const ArrowTable& index_column_info, diff --git a/libtiledbsoma/src/utils/common.h b/libtiledbsoma/src/utils/common.h index eea811475d..be90fd01b2 100644 --- a/libtiledbsoma/src/utils/common.h +++ b/libtiledbsoma/src/utils/common.h @@ -51,6 +51,8 @@ const std::string SOMA_GEOMETRY_DIMENSION_PREFIX = "tiledb__internal__"; const std::string ARROW_DATATYPE_METADATA_KEY = "dtype"; // SOMAColumn metadata keys +const std::string TDB_SOMA_SCHEMA_KEY = "tiledb_soma_schema"; +const std::string TDB_SOMA_SCHEMA_VERSION = "0.0.1"; const std::string TDB_SOMA_SCHEMA_COL_KEY = "tiledb_columns"; const std::string TDB_SOMA_SCHEMA_COL_TYPE_KEY = "tiledb_column_type"; const std::string TDB_SOMA_SCHEMA_COL_DIM_KEY = "tiledb_dimensions"; From 3c07257db9c990799216d176758e4c62becba924 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Fri, 17 Jan 2025 20:10:40 +0200 Subject: [PATCH 09/16] Update unit tests --- libtiledbsoma/test/unit_soma_array.cc | 12 ++++++------ libtiledbsoma/test/unit_soma_dataframe.cc | 8 ++++---- libtiledbsoma/test/unit_soma_dense_ndarray.cc | 8 ++++---- libtiledbsoma/test/unit_soma_sparse_ndarray.cc | 10 +++++----- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_array.cc b/libtiledbsoma/test/unit_soma_array.cc index d5f09b719f..6d12304d2f 100644 --- a/libtiledbsoma/test/unit_soma_array.cc +++ b/libtiledbsoma/test/unit_soma_array.cc @@ -92,7 +92,7 @@ std::tuple create_array( // Create array SOMAArray::create( - ctx, uri, std::move(schema), "NONE", TimestampRange(0, 2)); + ctx, uri, std::move(schema), "NONE", "", TimestampRange(0, 2)); uint64_t nnz = num_fragments * num_cells_per_fragment; @@ -386,7 +386,7 @@ TEST_CASE("SOMAArray: metadata") { // Read metadata soma_array->open(OpenMode::read, TimestampRange(0, 2)); - REQUIRE(soma_array->metadata_num() == 3); + REQUIRE(soma_array->metadata_num() == 4); REQUIRE(soma_array->has_metadata("soma_object_type")); REQUIRE(soma_array->has_metadata("soma_encoding_version")); REQUIRE(soma_array->has_metadata("md")); @@ -398,7 +398,7 @@ TEST_CASE("SOMAArray: metadata") { // md should not be available at (2, 2) soma_array->open(OpenMode::read, TimestampRange(2, 2)); - REQUIRE(soma_array->metadata_num() == 2); + REQUIRE(soma_array->metadata_num() == 3); REQUIRE(soma_array->has_metadata("soma_object_type")); REQUIRE(soma_array->has_metadata("soma_encoding_version")); REQUIRE(!soma_array->has_metadata("md")); @@ -406,7 +406,7 @@ TEST_CASE("SOMAArray: metadata") { // Metadata should also be retrievable in write mode soma_array->open(OpenMode::write, TimestampRange(0, 2)); - REQUIRE(soma_array->metadata_num() == 3); + REQUIRE(soma_array->metadata_num() == 4); REQUIRE(soma_array->has_metadata("soma_object_type")); REQUIRE(soma_array->has_metadata("soma_encoding_version")); REQUIRE(soma_array->has_metadata("md")); @@ -422,7 +422,7 @@ TEST_CASE("SOMAArray: metadata") { // Confirm delete in read mode soma_array->open(OpenMode::read, TimestampRange(0, 2)); REQUIRE(!soma_array->has_metadata("md")); - REQUIRE(soma_array->metadata_num() == 2); + REQUIRE(soma_array->metadata_num() == 3); } TEST_CASE("SOMAArray: Test buffer size") { @@ -480,7 +480,7 @@ TEST_CASE("SOMAArray: Write and read back Boolean") { schema.add_attribute(attr); schema.set_allows_dups(true); - SOMAArray::create(ctx, uri, std::move(schema), "NONE"); + SOMAArray::create(ctx, uri, std::move(schema), "NONE", ""); auto soma_array = SOMAArray::open(OpenMode::write, uri, ctx); auto arrow_schema = std::make_unique(); diff --git a/libtiledbsoma/test/unit_soma_dataframe.cc b/libtiledbsoma/test/unit_soma_dataframe.cc index cc73a20a83..3600b51e10 100644 --- a/libtiledbsoma/test/unit_soma_dataframe.cc +++ b/libtiledbsoma/test/unit_soma_dataframe.cc @@ -388,7 +388,7 @@ TEST_CASE_METHOD( // Read metadata sdf->open(OpenMode::read, TimestampRange(0, 2)); - REQUIRE(sdf->metadata_num() == 3); + REQUIRE(sdf->metadata_num() == 4); REQUIRE(sdf->has_metadata("soma_object_type")); REQUIRE(sdf->has_metadata("soma_encoding_version")); REQUIRE(sdf->has_metadata("md")); @@ -400,7 +400,7 @@ TEST_CASE_METHOD( // md should not be available at (0, 1) sdf->open(OpenMode::read, TimestampRange(0, 1)); - REQUIRE(sdf->metadata_num() == 2); + REQUIRE(sdf->metadata_num() == 3); REQUIRE(sdf->has_metadata("soma_object_type")); REQUIRE(sdf->has_metadata("soma_encoding_version")); REQUIRE(!sdf->has_metadata("md")); @@ -408,7 +408,7 @@ TEST_CASE_METHOD( // Metadata should also be retrievable in write mode sdf->open(OpenMode::write); - REQUIRE(sdf->metadata_num() == 3); + REQUIRE(sdf->metadata_num() == 4); REQUIRE(sdf->has_metadata("soma_object_type")); REQUIRE(sdf->has_metadata("soma_encoding_version")); REQUIRE(sdf->has_metadata("md")); @@ -424,7 +424,7 @@ TEST_CASE_METHOD( // Confirm delete in read mode sdf->open(OpenMode::read); REQUIRE(!sdf->has_metadata("md")); - REQUIRE(sdf->metadata_num() == 2); + REQUIRE(sdf->metadata_num() == 3); } TEST_CASE_METHOD( diff --git a/libtiledbsoma/test/unit_soma_dense_ndarray.cc b/libtiledbsoma/test/unit_soma_dense_ndarray.cc index 697f87db45..26c7298c85 100644 --- a/libtiledbsoma/test/unit_soma_dense_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_dense_ndarray.cc @@ -180,7 +180,7 @@ TEST_CASE("SOMADenseNDArray: metadata", "[SOMADenseNDArray]") { // Read metadata dnda->open(OpenMode::read, TimestampRange(0, 2)); - REQUIRE(dnda->metadata_num() == 3); + REQUIRE(dnda->metadata_num() == 4); REQUIRE(dnda->has_metadata("soma_object_type")); REQUIRE(dnda->has_metadata("soma_encoding_version")); REQUIRE(dnda->has_metadata("md")); @@ -192,7 +192,7 @@ TEST_CASE("SOMADenseNDArray: metadata", "[SOMADenseNDArray]") { // md should not be available at (0, 1) dnda->open(OpenMode::read, TimestampRange(0, 1)); - REQUIRE(dnda->metadata_num() == 2); + REQUIRE(dnda->metadata_num() == 3); REQUIRE(dnda->has_metadata("soma_object_type")); REQUIRE(dnda->has_metadata("soma_encoding_version")); REQUIRE(!dnda->has_metadata("md")); @@ -200,7 +200,7 @@ TEST_CASE("SOMADenseNDArray: metadata", "[SOMADenseNDArray]") { // Metadata should also be retrievable in write mode dnda->open(OpenMode::write); - REQUIRE(dnda->metadata_num() == 3); + REQUIRE(dnda->metadata_num() == 4); REQUIRE(dnda->has_metadata("soma_object_type")); REQUIRE(dnda->has_metadata("soma_encoding_version")); REQUIRE(dnda->has_metadata("md")); @@ -216,5 +216,5 @@ TEST_CASE("SOMADenseNDArray: metadata", "[SOMADenseNDArray]") { // Confirm delete in read mode dnda->open(OpenMode::read); REQUIRE(!dnda->has_metadata("md")); - REQUIRE(dnda->metadata_num() == 2); + REQUIRE(dnda->metadata_num() == 3); } diff --git a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc index 24cf9b4f0b..95fd596dff 100644 --- a/libtiledbsoma/test/unit_soma_sparse_ndarray.cc +++ b/libtiledbsoma/test/unit_soma_sparse_ndarray.cc @@ -239,7 +239,7 @@ TEST_CASE("SOMASparseNDArray: metadata", "[SOMASparseNDArray]") { // Read metadata snda->open(OpenMode::read, TimestampRange(0, 2)); - REQUIRE(snda->metadata_num() == 3); + REQUIRE(snda->metadata_num() == 4); REQUIRE(snda->has_metadata("soma_object_type")); REQUIRE(snda->has_metadata("soma_encoding_version")); REQUIRE(snda->has_metadata("md")); @@ -251,7 +251,7 @@ TEST_CASE("SOMASparseNDArray: metadata", "[SOMASparseNDArray]") { // md should not be available at (0, 1) snda->open(OpenMode::read, TimestampRange(0, 1)); - REQUIRE(snda->metadata_num() == 2); + REQUIRE(snda->metadata_num() == 3); REQUIRE(snda->has_metadata("soma_object_type")); REQUIRE(snda->has_metadata("soma_encoding_version")); REQUIRE(!snda->has_metadata("md")); @@ -259,7 +259,7 @@ TEST_CASE("SOMASparseNDArray: metadata", "[SOMASparseNDArray]") { // Metadata should also be retrievable in write mode snda->open(OpenMode::write); - REQUIRE(snda->metadata_num() == 3); + REQUIRE(snda->metadata_num() == 4); REQUIRE(snda->has_metadata("soma_object_type")); REQUIRE(snda->has_metadata("soma_encoding_version")); REQUIRE(snda->has_metadata("md")); @@ -269,7 +269,7 @@ TEST_CASE("SOMASparseNDArray: metadata", "[SOMASparseNDArray]") { // Delete and have it reflected when reading metadata while in write mode snda->delete_metadata("md"); REQUIRE(!snda->has_metadata("md")); - REQUIRE(snda->metadata_num() == 2); + REQUIRE(snda->metadata_num() == 3); mdval = snda->get_metadata("md"); REQUIRE(!mdval.has_value()); snda->close(); @@ -277,7 +277,7 @@ TEST_CASE("SOMASparseNDArray: metadata", "[SOMASparseNDArray]") { // Confirm delete in read mode snda->open(OpenMode::read); REQUIRE(!snda->has_metadata("md")); - REQUIRE(snda->metadata_num() == 2); + REQUIRE(snda->metadata_num() == 3); } TEST_CASE( From 2f9aad670f24659d518a1181b22ae51fea260213 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Sat, 18 Jan 2025 01:13:10 +0200 Subject: [PATCH 10/16] Generate columns on array open --- libtiledbsoma/src/soma/soma_array.cc | 52 ++++++++++++++++++++-------- libtiledbsoma/src/soma/soma_array.h | 3 +- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 28734b3fd6..3581c07e71 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -146,6 +146,7 @@ SOMAArray::SOMAArray( validate(mode, name, timestamp); reset(column_names, batch_size, result_order); fill_metadata_cache(timestamp); + fill_columns(); } SOMAArray::SOMAArray( @@ -164,6 +165,7 @@ SOMAArray::SOMAArray( validate(mode, name, timestamp); reset(column_names, batch_size, result_order); fill_metadata_cache(timestamp); + fill_columns(); } SOMAArray::SOMAArray( @@ -180,6 +182,7 @@ SOMAArray::SOMAArray( , schema_(std::make_shared(arr->schema())) { reset({}, batch_size_, result_order_); fill_metadata_cache(timestamp); + fill_columns(); } void SOMAArray::fill_metadata_cache(std::optional timestamp) { @@ -214,22 +217,43 @@ void SOMAArray::fill_metadata_cache(std::optional timestamp) { } } -void SOMAArray::fill_columns(const nlohmann::json& soma_schema_extension) { - if (!soma_schema_extension.contains(TDB_SOMA_SCHEMA_COL_KEY)) { - LOG_WARN(std::format( - "[SOMAArray][fill_columns] Missing '{}' key from '{}'", - TDB_SOMA_SCHEMA_COL_KEY, - TDB_SOMA_SCHEMA_KEY)); - } +void SOMAArray::fill_columns() { + columns_.clear(); + + if (!has_metadata(TDB_SOMA_SCHEMA_KEY)) { + LOG_DEBUG(std::format( + "[SOMAArray][fill_columns] Generating '{}' metadata for existing " + "array '{}'", + TDB_SOMA_SCHEMA_KEY, + uri())); - soma_schema_extension.value( - TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()); + columns_ = SOMAColumn::deserialize( + nlohmann::json::array(), *ctx_->tiledb_ctx(), *arr_); + } else { + auto soma_schema_extension_raw = get_metadata(TDB_SOMA_SCHEMA_KEY) + .value(); + auto data = static_cast( + std::get(soma_schema_extension_raw)); + auto soma_schema_extension = data != nullptr ? + nlohmann::json::parse(std::string( + data, + std::get( + soma_schema_extension_raw))) : + nlohmann::json::object(); + + if (!soma_schema_extension.contains(TDB_SOMA_SCHEMA_COL_KEY)) { + LOG_WARN(std::format( + "[SOMAArray][fill_columns] Missing '{}' key from '{}'", + TDB_SOMA_SCHEMA_COL_KEY, + TDB_SOMA_SCHEMA_KEY)); + } - columns_ = SOMAColumn::deserialize( - soma_schema_extension.value( - TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()), - *ctx_->tiledb_ctx(), - *arr_); + columns_ = SOMAColumn::deserialize( + soma_schema_extension.value( + TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()), + *ctx_->tiledb_ctx(), + *arr_); + } } const std::string SOMAArray::uri() const { diff --git a/libtiledbsoma/src/soma/soma_array.h b/libtiledbsoma/src/soma/soma_array.h index 88baf62de2..ea11b28b56 100644 --- a/libtiledbsoma/src/soma/soma_array.h +++ b/libtiledbsoma/src/soma/soma_array.h @@ -229,6 +229,7 @@ class SOMAArray : public SOMAObject { , first_read_next_(other.first_read_next_) , submitted_(other.submitted_) { fill_metadata_cache(timestamp_); + fill_columns(); } SOMAArray( @@ -1540,7 +1541,7 @@ class SOMAArray : public SOMAObject { void fill_metadata_cache(std::optional timestamp); - void fill_columns(const nlohmann::json& soma_schema_extension); + void fill_columns(); // SOMAArray URI std::string uri_; From bb2bbac9b4e0954e8bf47a2d9ac872bfadbc6057 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Tue, 21 Jan 2025 18:29:39 +0200 Subject: [PATCH 11/16] Add deserialization and default initialization on array open --- libtiledbsoma/src/soma/soma_array.cc | 45 ++++++++++++++++--- libtiledbsoma/src/soma/soma_column.h | 6 +-- libtiledbsoma/src/soma/soma_dimension.cc | 2 +- .../src/soma/soma_geometry_column.cc | 2 +- 4 files changed, 43 insertions(+), 12 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 3581c07e71..a036431c8c 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -220,6 +220,8 @@ void SOMAArray::fill_metadata_cache(std::optional timestamp) { void SOMAArray::fill_columns() { columns_.clear(); + bool generate_metadata = false; + if (!has_metadata(TDB_SOMA_SCHEMA_KEY)) { LOG_DEBUG(std::format( "[SOMAArray][fill_columns] Generating '{}' metadata for existing " @@ -227,8 +229,7 @@ void SOMAArray::fill_columns() { TDB_SOMA_SCHEMA_KEY, uri())); - columns_ = SOMAColumn::deserialize( - nlohmann::json::array(), *ctx_->tiledb_ctx(), *arr_); + generate_metadata = true; } else { auto soma_schema_extension_raw = get_metadata(TDB_SOMA_SCHEMA_KEY) .value(); @@ -242,17 +243,47 @@ void SOMAArray::fill_columns() { nlohmann::json::object(); if (!soma_schema_extension.contains(TDB_SOMA_SCHEMA_COL_KEY)) { - LOG_WARN(std::format( + LOG_DEBUG(std::format( "[SOMAArray][fill_columns] Missing '{}' key from '{}'", TDB_SOMA_SCHEMA_COL_KEY, TDB_SOMA_SCHEMA_KEY)); + + generate_metadata = true; + } else { + columns_ = SOMAColumn::deserialize( + soma_schema_extension.value( + TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()), + *ctx_->tiledb_ctx(), + *arr_); } + } + if (generate_metadata) { columns_ = SOMAColumn::deserialize( - soma_schema_extension.value( - TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()), - *ctx_->tiledb_ctx(), - *arr_); + nlohmann::json::array(), *ctx_->tiledb_ctx(), *arr_); + + nlohmann::json soma_schema_extension = { + {TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()}, + {"version", TDB_SOMA_SCHEMA_VERSION}, + }; + + std::for_each( + columns_.cbegin(), + columns_.cend(), + [&soma_schema_extension]( + const std::shared_ptr& column) { + column->serialize( + soma_schema_extension[TDB_SOMA_SCHEMA_COL_KEY]); + }); + + LOG_DEBUG("[SOMAArray][fill_columns] Writing generated metadata"); + + auto soma_schema_extension_str = soma_schema_extension.dump(); + set_metadata( + TDB_SOMA_SCHEMA_KEY, + TILEDB_STRING_UTF8, + soma_schema_extension_str.length(), + soma_schema_extension_str.c_str()); } } diff --git a/libtiledbsoma/src/soma/soma_column.h b/libtiledbsoma/src/soma/soma_column.h index 8699aa2e12..0a7a815d97 100644 --- a/libtiledbsoma/src/soma/soma_column.h +++ b/libtiledbsoma/src/soma/soma_column.h @@ -53,9 +53,6 @@ namespace tiledbsoma { using namespace tiledb; class SOMAColumn { - typedef std::shared_ptr (*Factory)( - const nlohmann::json&, const Context&, const Array&); - public: //=================================================================== //= public static @@ -517,6 +514,9 @@ class SOMAColumn { virtual std::any _core_current_domain_slot(NDRectangle& ndrect) const = 0; private: + typedef std::shared_ptr (*Factory)( + const nlohmann::json&, const Context&, const Array&); + static std::map deserialiser_map; }; diff --git a/libtiledbsoma/src/soma/soma_dimension.cc b/libtiledbsoma/src/soma/soma_dimension.cc index 99167cee09..8d19ca2854 100644 --- a/libtiledbsoma/src/soma/soma_dimension.cc +++ b/libtiledbsoma/src/soma/soma_dimension.cc @@ -36,7 +36,7 @@ namespace tiledbsoma { std::shared_ptr SOMADimension::deserialize( - const nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + const nlohmann::json& soma_schema, const Context&, const Array& array) { if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_DIM_KEY)) { throw TileDBSOMAError( "[SOMADimension][deserialize] Missing required field " diff --git a/libtiledbsoma/src/soma/soma_geometry_column.cc b/libtiledbsoma/src/soma/soma_geometry_column.cc index f4c47f6617..45c98f93d2 100644 --- a/libtiledbsoma/src/soma/soma_geometry_column.cc +++ b/libtiledbsoma/src/soma/soma_geometry_column.cc @@ -34,7 +34,7 @@ namespace tiledbsoma { std::shared_ptr SOMAGeometryColumn::deserialize( - const nlohmann::json& soma_schema, const Context& ctx, const Array& array) { + const nlohmann::json& soma_schema, const Context&, const Array& array) { if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_DIM_KEY)) { throw TileDBSOMAError( "[SOMAGeometryColumn][deserialize] Missing required field " From 9957ed57c5aa73be59b7abe18d132343d1e85531 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Tue, 21 Jan 2025 21:44:48 +0200 Subject: [PATCH 12/16] Write SOMAColumn metadata if array is open in `write` mode --- libtiledbsoma/src/soma/soma_array.cc | 59 ++++++++++++++++------------ 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index a036431c8c..9b6f7609a8 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -224,10 +224,8 @@ void SOMAArray::fill_columns() { if (!has_metadata(TDB_SOMA_SCHEMA_KEY)) { LOG_DEBUG(std::format( - "[SOMAArray][fill_columns] Generating '{}' metadata for existing " - "array '{}'", - TDB_SOMA_SCHEMA_KEY, - uri())); + "[SOMAArray][fill_columns] Missing '{}' metadata key", + TDB_SOMA_SCHEMA_KEY)); generate_metadata = true; } else { @@ -262,28 +260,37 @@ void SOMAArray::fill_columns() { columns_ = SOMAColumn::deserialize( nlohmann::json::array(), *ctx_->tiledb_ctx(), *arr_); - nlohmann::json soma_schema_extension = { - {TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()}, - {"version", TDB_SOMA_SCHEMA_VERSION}, - }; - - std::for_each( - columns_.cbegin(), - columns_.cend(), - [&soma_schema_extension]( - const std::shared_ptr& column) { - column->serialize( - soma_schema_extension[TDB_SOMA_SCHEMA_COL_KEY]); - }); - - LOG_DEBUG("[SOMAArray][fill_columns] Writing generated metadata"); - - auto soma_schema_extension_str = soma_schema_extension.dump(); - set_metadata( - TDB_SOMA_SCHEMA_KEY, - TILEDB_STRING_UTF8, - soma_schema_extension_str.length(), - soma_schema_extension_str.c_str()); + if (mode() == OpenMode::write) { + LOG_DEBUG(std::format( + "[SOMAArray][fill_columns] Generating '{}' metadata for " + "existing " + "array '{}'", + TDB_SOMA_SCHEMA_KEY, + uri())); + + nlohmann::json soma_schema_extension = { + {TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()}, + {"version", TDB_SOMA_SCHEMA_VERSION}, + }; + + std::for_each( + columns_.cbegin(), + columns_.cend(), + [&soma_schema_extension]( + const std::shared_ptr& column) { + column->serialize( + soma_schema_extension[TDB_SOMA_SCHEMA_COL_KEY]); + }); + + LOG_DEBUG("[SOMAArray][fill_columns] Writing generated metadata"); + + auto soma_schema_extension_str = soma_schema_extension.dump(); + set_metadata( + TDB_SOMA_SCHEMA_KEY, + TILEDB_STRING_UTF8, + soma_schema_extension_str.length(), + soma_schema_extension_str.c_str()); + } } } From 63f27b1ee578bcd0a4763016e123cfd1bf129a27 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 22 Jan 2025 12:18:59 +0200 Subject: [PATCH 13/16] Write metadata directly to TileDB array --- libtiledbsoma/src/soma/soma_array.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libtiledbsoma/src/soma/soma_array.cc b/libtiledbsoma/src/soma/soma_array.cc index 9b6f7609a8..0683b43725 100644 --- a/libtiledbsoma/src/soma/soma_array.cc +++ b/libtiledbsoma/src/soma/soma_array.cc @@ -285,7 +285,7 @@ void SOMAArray::fill_columns() { LOG_DEBUG("[SOMAArray][fill_columns] Writing generated metadata"); auto soma_schema_extension_str = soma_schema_extension.dump(); - set_metadata( + arr_->put_metadata( TDB_SOMA_SCHEMA_KEY, TILEDB_STRING_UTF8, soma_schema_extension_str.length(), From 1251985521e3c8f5489e9c7e2ba0b6a62d431aab Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 22 Jan 2025 13:28:21 +0200 Subject: [PATCH 14/16] Fix error in tests after rebase --- libtiledbsoma/test/unit_soma_column.cc | 165 +------------------------ 1 file changed, 1 insertion(+), 164 deletions(-) diff --git a/libtiledbsoma/test/unit_soma_column.cc b/libtiledbsoma/test/unit_soma_column.cc index 6ee99990c0..960f8f1b8c 100644 --- a/libtiledbsoma/test/unit_soma_column.cc +++ b/libtiledbsoma/test/unit_soma_column.cc @@ -599,167 +599,4 @@ TEST_CASE_METHOD( REQUIRE(ext_res->num_rows() == 1); } -} - -TEST_CASE_METHOD( - VariouslyIndexedDataFrameFixture, - "SOMAColumn: query variant-indexed dataframe dim-str-u32 attr-sjid", - "[SOMADataFrame]") { - auto specify_domain = GENERATE(false, true); - SECTION(std::format("- specify_domain={}", specify_domain)) { - std::string suffix1 = specify_domain ? "true" : "false"; - set_up( - std::make_shared(), - "mem://unit-test-column-variant-indexed-dataframe-4-" + suffix1); - - std::string string_lo = ""; - std::string string_hi = ""; - std::vector dim_infos( - {str_dim_info(string_lo, string_hi), u32_dim_info()}); - std::vector attr_infos({i64_attr_info()}); - - // Create - create(dim_infos, attr_infos); - - // Check current domain - auto sdf = open(OpenMode::read); - - // External column initialization - auto raw_array = tiledb::Array(*ctx_->tiledb_ctx(), uri_, TILEDB_READ); - std::vector> columns; - - for (auto dimension : sdf->tiledb_schema()->domain().dimensions()) { - columns.push_back( - std::make_shared(SOMADimension(dimension))); - } - - for (size_t i = 0; i < sdf->tiledb_schema()->attribute_num(); ++i) { - columns.push_back(std::make_shared( - SOMAAttribute(sdf->tiledb_schema()->attribute(i)))); - } - - CurrentDomain current_domain = sdf->get_current_domain_for_test(); - - REQUIRE(!current_domain.is_empty()); - REQUIRE(current_domain.type() == TILEDB_NDRECTANGLE); - NDRectangle ndrect = current_domain.ndrectangle(); - - std::array str_range = ndrect.range( - dim_infos[0].name); - std::pair - str_external = columns[0]->core_current_domain_slot( - *ctx_, raw_array); - - // Can we write empty strings in this range? - REQUIRE(str_range[0] <= ""); - REQUIRE(str_external.first <= ""); - REQUIRE(str_range[1] >= ""); - REQUIRE(str_external.second >= ""); - // Can we write ASCII values in this range? - REQUIRE(str_range[0] < " "); - REQUIRE(str_external.first <= " "); - REQUIRE(str_range[1] > "~"); - // REQUIRE(str_external.second >= "~"); - - std::array u32_range = ndrect.range( - dim_infos[1].name); - std::pair - u32_external = columns[1]->core_current_domain_slot( - *ctx_, raw_array); - REQUIRE(u32_range[0] == u32_external.first); - REQUIRE(u32_range[1] == u32_external.second); - - // Check shape before write - std::optional actual = sdf->maybe_soma_joinid_shape(); - REQUIRE(!actual.has_value()); - - // Check domainish accessors before resize - ArrowTable non_empty_domain = sdf->get_non_empty_domain(); - std::vector - ned_str = ArrowAdapter::get_table_string_column_by_name( - non_empty_domain, "mystring"); - - std::vector - ned_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_non_empty_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); - - ArrowTable soma_domain = sdf->get_soma_domain(); - std::vector - dom_str = ArrowAdapter::get_table_string_column_by_name( - soma_domain, "mystring"); - - std::vector - dom_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_core_current_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); - - ArrowTable soma_maxdomain = sdf->get_soma_maxdomain(); - std::vector - maxdom_str = ArrowAdapter::get_table_string_column_by_name( - soma_maxdomain, "mystring"); - - std::vector - maxdom_str_col = ArrowAdapter::get_array_string_column( - columns[0]->arrow_domain_slot( - *ctx_, raw_array, Domainish::kind_core_domain), - columns[0]->arrow_schema_slot(*ctx_, raw_array)); - - REQUIRE(ned_str == std::vector({"", ""})); - - REQUIRE(ned_str == ned_str_col); - REQUIRE(dom_str == dom_str_col); - REQUIRE(maxdom_str == maxdom_str_col); - - if (specify_domain) { - REQUIRE(dom_str[0] == dim_infos[0].string_lo); - REQUIRE(dom_str[1] == dim_infos[0].string_hi); - } else { - REQUIRE(dom_str == std::vector({"", ""})); - } - REQUIRE(maxdom_str == std::vector({"", ""})); - - sdf->close(); - - sdf = open(OpenMode::write); - write_sjid_u32_str_data_from(0); - - sdf->close(); - - sdf = open(OpenMode::read); - REQUIRE(sdf->nnz() == 2); - - sdf->close(); - - auto external_query = std::make_unique( - open(OpenMode::read), ctx_->tiledb_ctx()); - - columns[1]->select_columns(external_query); - columns[1]->set_dim_point(external_query, *ctx_, 1234); - - // Configure query and allocate result buffers - external_query->setup_read(); - external_query->submit_read(); - auto ext_res = external_query->results(); - - REQUIRE(ext_res->num_rows() == 1); - - external_query->reset(); - - columns[0]->select_columns(external_query); - columns[0]->set_dim_ranges( - external_query, - *ctx_, - std::vector( - {std::make_pair("apple", "b")})); - - // Configure query and allocate result buffers - external_query->setup_read(); - external_query->submit_read(); - ext_res = external_query->results(); - - REQUIRE(ext_res->num_rows() == 1); - } -} +} \ No newline at end of file From ffc38dbbb2966236e511b4b0b330ae661d2461a9 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 22 Jan 2025 17:16:09 +0200 Subject: [PATCH 15/16] Handle addition and deletion of attributes --- libtiledbsoma/src/soma/soma_attribute.cc | 5 +++ libtiledbsoma/src/soma/soma_column.cc | 43 +++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/libtiledbsoma/src/soma/soma_attribute.cc b/libtiledbsoma/src/soma/soma_attribute.cc index bae5ab6b62..4ea9e4081f 100644 --- a/libtiledbsoma/src/soma/soma_attribute.cc +++ b/libtiledbsoma/src/soma/soma_attribute.cc @@ -52,6 +52,11 @@ std::shared_ptr SOMAAttribute::deserialize( attribute_names.size())); } + if (!array.schema().has_attribute(attribute_names[0])) { + // Attribute probably dropped so skip column reconstruction. + return nullptr; + } + auto attribute = array.schema().attribute(attribute_names[0]); auto enumeration_name = AttributeExperimental::get_enumeration_name( ctx, attribute); diff --git a/libtiledbsoma/src/soma/soma_column.cc b/libtiledbsoma/src/soma/soma_column.cc index cc4b769600..1dc5cdae91 100644 --- a/libtiledbsoma/src/soma/soma_column.cc +++ b/libtiledbsoma/src/soma/soma_column.cc @@ -55,7 +55,48 @@ std::vector> SOMAColumn::deserialize( auto type = column[TDB_SOMA_SCHEMA_COL_TYPE_KEY] .template get(); - columns.push_back(deserialiser_map[type](column, ctx, array)); + auto col = deserialiser_map[type](column, ctx, array); + + if (col) { + // Deserialized column can be null in case the array is modified + // and the column no longer exists. + columns.push_back(deserialiser_map[type](column, ctx, array)); + } + } + + // Check for any newly added attributes + std::unordered_set used_attribute_names; + + std::for_each( + columns.cbegin(), + columns.cend(), + [&used_attribute_names](const std::shared_ptr& col) { + if (col->tiledb_attributes().has_value()) { + auto attributes = col->tiledb_attributes().value(); + for (const auto& attribute : attributes) { + used_attribute_names.insert(attribute.name()); + } + } + }); + + for (size_t i = 0; i < array.schema().attribute_num(); ++i) { + auto attribute = array.schema().attribute(i); + + // Attribute is already used by another attribute so we skip + if (used_attribute_names.contains(attribute.name())) { + continue; + } + + auto enumeration_name = AttributeExperimental::get_enumeration_name( + ctx, attribute); + auto enumeration = enumeration_name.has_value() ? + std::make_optional( + ArrayExperimental::get_enumeration( + ctx, array, attribute.name())) : + std::nullopt; + + columns.push_back( + std::make_shared(attribute, enumeration)); } } else { // All arrays before the introduction of SOMAColumn do not have From 2da74549166f609188cfec8357a052209eb854f0 Mon Sep 17 00:00:00 2001 From: XanthosXanthopoulos Date: Wed, 22 Jan 2025 18:39:10 +0200 Subject: [PATCH 16/16] Fix R tests --- apis/r/tests/testthat/test-04-TileDBArray.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apis/r/tests/testthat/test-04-TileDBArray.R b/apis/r/tests/testthat/test-04-TileDBArray.R index 4e6e5986b0..1afe09063e 100644 --- a/apis/r/tests/testthat/test-04-TileDBArray.R +++ b/apis/r/tests/testthat/test-04-TileDBArray.R @@ -33,7 +33,7 @@ test_that("TileDBArray helper functions", { tdba$open(mode = "READ", internal_use_only = "allowed_use") expect_equal(tdba$get_metadata(key = "int_column"), "float_column") expect_equal(tdba$get_metadata(key = "string_column"), "qux") - expect_equal(length(tdba$get_metadata()), 2) + expect_equal(length(tdba$get_metadata()), 3) tdba$close() # The SOMA spec requires the ability to read back metadata even when the @@ -41,7 +41,7 @@ test_that("TileDBArray helper functions", { tdba$open(mode = "WRITE", internal_use_only = "allowed_use") expect_equal(tdba$get_metadata(key = "int_column"), "float_column") expect_equal(tdba$get_metadata(key = "string_column"), "qux") - expect_equal(length(tdba$get_metadata()), 2) + expect_equal(length(tdba$get_metadata()), 3) tdba$close() ## shape