Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] SOMAColumn serialization/deserialization #3599

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions apis/r/tests/testthat/test-04-TileDBArray.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,15 @@ test_that("TileDBArray helper functions", {
tdba$open(mode = "READ", internal_use_only = "allowed_use")
expect_equal(tdba$get_metadata(key = "int_column"), "float_column")
expect_equal(tdba$get_metadata(key = "string_column"), "qux")
expect_equal(length(tdba$get_metadata()), 2)
expect_equal(length(tdba$get_metadata()), 3)
tdba$close()

# The SOMA spec requires the ability to read back metadata even when the
# array is opened for write.
tdba$open(mode = "WRITE", internal_use_only = "allowed_use")
expect_equal(tdba$get_metadata(key = "int_column"), "float_column")
expect_equal(tdba$get_metadata(key = "string_column"), "qux")
expect_equal(length(tdba$get_metadata()), 2)
expect_equal(length(tdba$get_metadata()), 3)
tdba$close()

## shape
Expand Down
87 changes: 87 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ void SOMAArray::create(
std::string_view uri,
ArraySchema schema,
std::string_view soma_type,
std::string_view soma_schema,
std::optional<TimestampRange> timestamp) {
Array::create(std::string(uri), schema);

Expand Down Expand Up @@ -73,6 +74,12 @@ void SOMAArray::create(
TILEDB_STRING_UTF8,
static_cast<uint32_t>(ENCODING_VERSION_VAL.length()),
ENCODING_VERSION_VAL.c_str());

array->put_metadata(
TDB_SOMA_SCHEMA_KEY,
TILEDB_STRING_UTF8,
static_cast<uint32_t>(soma_schema.length()),
soma_schema.data());
}

std::unique_ptr<SOMAArray> SOMAArray::open(
Expand Down Expand Up @@ -139,6 +146,7 @@ SOMAArray::SOMAArray(
validate(mode, name, timestamp);
reset(column_names, batch_size, result_order);
fill_metadata_cache(timestamp);
fill_columns();
}

SOMAArray::SOMAArray(
Expand All @@ -157,6 +165,7 @@ SOMAArray::SOMAArray(
validate(mode, name, timestamp);
reset(column_names, batch_size, result_order);
fill_metadata_cache(timestamp);
fill_columns();
}

SOMAArray::SOMAArray(
Expand All @@ -173,6 +182,7 @@ SOMAArray::SOMAArray(
, schema_(std::make_shared<ArraySchema>(arr->schema())) {
reset({}, batch_size_, result_order_);
fill_metadata_cache(timestamp);
fill_columns();
}

void SOMAArray::fill_metadata_cache(std::optional<TimestampRange> timestamp) {
Expand Down Expand Up @@ -207,6 +217,83 @@ void SOMAArray::fill_metadata_cache(std::optional<TimestampRange> timestamp) {
}
}

/**
 * Populate `columns_` for the opened array.
 *
 * The columns are restored from the JSON stored under the
 * TDB_SOMA_SCHEMA_KEY metadata entry when that entry exists and holds a
 * TDB_SOMA_SCHEMA_COL_KEY member. Otherwise the columns are derived from the
 * TileDB array itself (by deserializing an empty column list) and, when the
 * array is opened for writing, the derived description is serialized and
 * written back under TDB_SOMA_SCHEMA_KEY.
 */
void SOMAArray::fill_columns() {
    columns_.clear();

    // Becomes true once the columns were rebuilt from the stored SOMA schema.
    bool restored_from_metadata = false;

    if (has_metadata(TDB_SOMA_SCHEMA_KEY)) {
        const auto raw_value = get_metadata(TDB_SOMA_SCHEMA_KEY).value();
        const auto* bytes = static_cast<const char*>(
            std::get<const void*>(raw_value));

        // A null payload is treated as an empty JSON object.
        nlohmann::json stored_schema = nlohmann::json::object();
        if (bytes != nullptr) {
            stored_schema = nlohmann::json::parse(
                std::string(bytes, std::get<uint32_t>(raw_value)));
        }

        if (stored_schema.contains(TDB_SOMA_SCHEMA_COL_KEY)) {
            columns_ = SOMAColumn::deserialize(
                stored_schema.value(
                    TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()),
                *ctx_->tiledb_ctx(),
                *arr_);
            restored_from_metadata = true;
        } else {
            LOG_DEBUG(std::format(
                "[SOMAArray][fill_columns] Missing '{}' key from '{}'",
                TDB_SOMA_SCHEMA_COL_KEY,
                TDB_SOMA_SCHEMA_KEY));
        }
    } else {
        LOG_DEBUG(std::format(
            "[SOMAArray][fill_columns] Missing '{}' metadata key",
            TDB_SOMA_SCHEMA_KEY));
    }

    if (restored_from_metadata) {
        return;
    }

    // No usable stored schema: build the columns from the array itself.
    columns_ = SOMAColumn::deserialize(
        nlohmann::json::array(), *ctx_->tiledb_ctx(), *arr_);

    // The generated schema is only written back when opened for writing.
    if (mode() != OpenMode::write) {
        return;
    }

    LOG_DEBUG(std::format(
        "[SOMAArray][fill_columns] Generating '{}' metadata for existing "
        "array '{}'",
        TDB_SOMA_SCHEMA_KEY,
        uri()));

    nlohmann::json generated_schema = {
        {TDB_SOMA_SCHEMA_COL_KEY, nlohmann::json::array()},
        {"version", TDB_SOMA_SCHEMA_VERSION},
    };

    // Each column appends its own description to the column list.
    for (const auto& column : columns_) {
        column->serialize(generated_schema[TDB_SOMA_SCHEMA_COL_KEY]);
    }

    LOG_DEBUG("[SOMAArray][fill_columns] Writing generated metadata");

    const std::string serialized = generated_schema.dump();
    arr_->put_metadata(
        TDB_SOMA_SCHEMA_KEY,
        TILEDB_STRING_UTF8,
        static_cast<uint32_t>(serialized.length()),
        serialized.c_str());
}

/**
 * Return a copy of the URI stored in this SOMAArray (`uri_`).
 */
const std::string SOMAArray::uri() const {
    return uri_;
}
Expand Down
8 changes: 8 additions & 0 deletions libtiledbsoma/src/soma/soma_array.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
#include "enums.h"
#include "logger_public.h"
#include "managed_query.h"
#include "soma_column.h"
#include "soma_object.h"

// ================================================================
Expand Down Expand Up @@ -112,6 +113,7 @@ class SOMAArray : public SOMAObject {
std::string_view uri,
ArraySchema schema,
std::string_view soma_type,
std::string_view soma_schema,
std::optional<TimestampRange> timestamp = std::nullopt);

/**
Expand Down Expand Up @@ -227,6 +229,7 @@ class SOMAArray : public SOMAObject {
, first_read_next_(other.first_read_next_)
, submitted_(other.submitted_) {
fill_metadata_cache(timestamp_);
fill_columns();
}

SOMAArray(
Expand Down Expand Up @@ -1538,6 +1541,8 @@ class SOMAArray : public SOMAObject {

void fill_metadata_cache(std::optional<TimestampRange> timestamp);

void fill_columns();

// SOMAArray URI
std::string uri_;

Expand All @@ -1560,6 +1565,9 @@ class SOMAArray : public SOMAObject {
// Metadata cache
std::map<std::string, MetadataValue> metadata_;

// SOMAColumn list
std::vector<std::shared_ptr<SOMAColumn>> columns_;

// Read timestamp range (start, end)
std::optional<TimestampRange> timestamp_;

Expand Down
47 changes: 47 additions & 0 deletions libtiledbsoma/src/soma/soma_attribute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,43 @@
#include "soma_attribute.h"

namespace tiledbsoma {
/**
 * Reconstruct a SOMAAttribute column from its serialized description.
 *
 * @param soma_schema JSON description of the column. It must contain, under
 * TDB_SOMA_SCHEMA_COL_ATTR_KEY, a list holding exactly one attribute name.
 * @param ctx TileDB context used to look up the attribute's enumeration.
 * @param array Array whose schema is searched for the attribute.
 * @return The reconstructed column, or nullptr when the array schema has no
 * attribute with the serialized name.
 * @throws TileDBSOMAError if the attribute list is missing or does not hold
 * exactly one name.
 */
std::shared_ptr<SOMAColumn> SOMAAttribute::deserialize(
    const nlohmann::json& soma_schema, const Context& ctx, const Array& array) {
    if (!soma_schema.contains(TDB_SOMA_SCHEMA_COL_ATTR_KEY)) {
        throw TileDBSOMAError(
            "[SOMAAttribute][deserialize] Missing required field "
            "'tiledb_attributes'");
    }

    const std::vector<std::string> attribute_names =
        soma_schema[TDB_SOMA_SCHEMA_COL_ATTR_KEY]
            .get<std::vector<std::string>>();

    if (attribute_names.size() != 1) {
        throw TileDBSOMAError(std::format(
            "[SOMAAttribute][deserialize] Invalid number of attributes. "
            "Expected 1, got {}",
            attribute_names.size()));
    }

    const std::string& attribute_name = attribute_names.front();

    // Fetch the schema once instead of once per query.
    auto schema = array.schema();

    if (!schema.has_attribute(attribute_name)) {
        // Attribute probably dropped so skip column reconstruction.
        return nullptr;
    }

    auto attribute = schema.attribute(attribute_name);
    auto enumeration_name = AttributeExperimental::get_enumeration_name(
        ctx, attribute);

    // Only attributes that report an enumeration name carry an enumeration.
    std::optional<Enumeration> enumeration = std::nullopt;
    if (enumeration_name) {
        enumeration = ArrayExperimental::get_enumeration(
            ctx, array, attribute.name());
    }

    return std::make_shared<SOMAAttribute>(attribute, enumeration);
}

std::shared_ptr<SOMAAttribute> SOMAAttribute::create(
std::shared_ptr<Context> ctx,
ArrowSchema* schema,
Expand Down Expand Up @@ -124,4 +161,14 @@ ArrowSchema* SOMAAttribute::arrow_schema_slot(
attribute, *ctx.tiledb_ctx(), array)
.release();
}

/**
 * Append the JSON description of this column to `columns_schema`.
 *
 * The appended entry records the column type (SOMA_COLUMN_ATTRIBUTE) under
 * TDB_SOMA_SCHEMA_COL_TYPE_KEY and a one-element list with the name of the
 * wrapped TileDB attribute under TDB_SOMA_SCHEMA_COL_ATTR_KEY.
 *
 * @param columns_schema JSON value the entry is pushed onto.
 */
void SOMAAttribute::serialize(nlohmann::json& columns_schema) const {
    const auto type_id = static_cast<uint32_t>(
        soma_column_datatype_t::SOMA_COLUMN_ATTRIBUTE);

    nlohmann::json entry;
    entry[TDB_SOMA_SCHEMA_COL_TYPE_KEY] = type_id;
    entry[TDB_SOMA_SCHEMA_COL_ATTR_KEY] = nlohmann::json::array(
        {attribute.name()});

    columns_schema.push_back(entry);
}
} // namespace tiledbsoma
11 changes: 11 additions & 0 deletions libtiledbsoma/src/soma/soma_attribute.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ using namespace tiledb;

class SOMAAttribute : public SOMAColumn {
public:
//===================================================================
//= public static
//===================================================================

static std::shared_ptr<SOMAColumn> deserialize(
const nlohmann::json& soma_schema,
const Context& ctx,
const Array& array);

/**
* Create a ``SOMAAttribute`` shared pointer from an Arrow schema
*/
Expand Down Expand Up @@ -114,6 +123,8 @@ class SOMAAttribute : public SOMAColumn {
ArrowSchema* arrow_schema_slot(
const SOMAContext& ctx, Array& array) override;

void serialize(nlohmann::json&) const override;

private:
void _set_dim_points(
const std::unique_ptr<ManagedQuery>& query,
Expand Down
90 changes: 90 additions & 0 deletions libtiledbsoma/src/soma/soma_column.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,98 @@

#include "soma_column.h"

#include "soma_attribute.h"
#include "soma_dimension.h"
#include "soma_geometry_column.h"

namespace tiledbsoma {

// Dispatch table used by SOMAColumn::deserialize: maps the numeric column type
// read from a serialized column description to the static `deserialize`
// function of the matching SOMAColumn subclass.
std::map<uint32_t, SOMAColumn::Factory> SOMAColumn::deserialiser_map = {
{soma_column_datatype_t::SOMA_COLUMN_ATTRIBUTE, SOMAAttribute::deserialize},
{soma_column_datatype_t::SOMA_COLUMN_DIMENSION, SOMADimension::deserialize},
{soma_column_datatype_t::SOMA_COLUMN_GEOMETRY,
SOMAGeometryColumn::deserialize}};

/**
 * Build the list of SOMA columns of `array`.
 *
 * When `soma_schema_columns` is non-empty, every entry is dispatched on its
 * TDB_SOMA_SCHEMA_COL_TYPE_KEY value through `deserialiser_map`; entries whose
 * factory returns nullptr (the underlying TileDB object no longer exists) are
 * dropped. When it is empty, one SOMADimension is created per dimension of the
 * array domain.
 *
 * In both cases every TileDB attribute of the array that is not already
 * referenced by a column is appended as a SOMAAttribute, in schema order.
 *
 * @param soma_schema_columns Serialized column descriptions (may be empty).
 * @param ctx TileDB context used for enumeration lookups.
 * @param array Array the columns belong to.
 * @return The reconstructed columns.
 * @throws TileDBSOMAError if an entry names a column type without a
 * registered deserializer.
 */
std::vector<std::shared_ptr<SOMAColumn>> SOMAColumn::deserialize(
    const nlohmann::json& soma_schema_columns,
    const Context& ctx,
    const Array& array) {
    std::vector<std::shared_ptr<SOMAColumn>> columns;

    // Fetch the schema once; it is consulted repeatedly below.
    auto schema = array.schema();

    if (soma_schema_columns.empty()) {
        // All arrays before the introduction of SOMAColumn do not have
        // composite columns, thus the metadata are trivially constructible
        for (auto& dimension : schema.domain().dimensions()) {
            columns.push_back(std::make_shared<SOMADimension>(dimension));
        }
    } else {
        for (const auto& column : soma_schema_columns) {
            // `at` throws on a missing key instead of invoking undefined
            // behaviour like the const `operator[]` does.
            const auto type = column.at(TDB_SOMA_SCHEMA_COL_TYPE_KEY)
                                  .get<uint32_t>();

            // `find` keeps unknown types from default-inserting an empty
            // factory into the map and then calling it.
            const auto factory = deserialiser_map.find(type);
            if (factory == deserialiser_map.end()) {
                throw TileDBSOMAError(std::format(
                    "[SOMAColumn][deserialize] Unknown column type {}",
                    type));
            }

            // Deserialize exactly once and reuse the result.
            auto col = factory->second(column, ctx, array);

            if (col) {
                // Deserialized column can be null in case the array is
                // modified and the column no longer exists.
                columns.push_back(std::move(col));
            }
        }
    }

    // Names of the TileDB attributes already referenced by a column.
    std::unordered_set<std::string> used_attribute_names;
    for (const auto& col : columns) {
        const auto attributes = col->tiledb_attributes();
        if (attributes.has_value()) {
            for (const auto& attribute : attributes.value()) {
                used_attribute_names.insert(attribute.name());
            }
        }
    }

    // Append any attribute not covered yet (all of them for legacy arrays,
    // newly added ones otherwise). Index-based iteration keeps schema order.
    for (size_t i = 0; i < schema.attribute_num(); ++i) {
        auto attribute = schema.attribute(i);

        // Attribute is already used by another column so we skip
        if (used_attribute_names.contains(attribute.name())) {
            continue;
        }

        auto enumeration_name = AttributeExperimental::get_enumeration_name(
            ctx, attribute);
        auto enumeration = enumeration_name.has_value() ?
                               std::make_optional(
                                   ArrayExperimental::get_enumeration(
                                       ctx, array, attribute.name())) :
                               std::nullopt;

        columns.push_back(
            std::make_shared<SOMAAttribute>(attribute, enumeration));
    }

    return columns;
}

template <>
std::pair<std::string, std::string> SOMAColumn::core_domain_slot<std::string>()
const {
Expand Down
Loading