Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[c++] Readback-testing pieces for polytype domainish accessors #3018

Merged
merged 3 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 87 additions & 0 deletions libtiledbsoma/src/utils/arrow_adapter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1309,4 +1309,91 @@ std::unique_ptr<ArrowArray> ArrowAdapter::make_arrow_array_parent(
return arrow_array;
}

// Recursively verifies that an Arrow array and its schema have the same tree
// shape: at every level, the array's child count must equal the schema's.
//
// Throws std::runtime_error on the first level where the counts disagree.
void ArrowAdapter::_check_shapes(
    ArrowArray* arrow_array, ArrowSchema* arrow_schema) {
    const int64_t num_children = arrow_array->n_children;

    if (num_children != arrow_schema->n_children) {
        throw std::runtime_error(
            "ArrowAdapter::_check_shapes: internal coding error: data/schema "
            "mismatch");
    }

    // Counts agree at this level; descend into each array/schema child pair.
    for (int64_t child = 0; child < num_children; ++child) {
        ArrowArray* sub_array = arrow_array->children[child];
        ArrowSchema* sub_schema = arrow_schema->children[child];
        _check_shapes(sub_array, sub_schema);
    }
}

// Returns the zero-based index of the first top-level column of `arrow_table`
// whose schema name equals `column_name`.
//
// The array/schema child counts are validated with _check_shapes first.
//
// Throws std::runtime_error if the table has no columns, or if no column
// carries the requested name.
int64_t ArrowAdapter::_get_column_index_from_name(
    const ArrowTable& arrow_table, std::string column_name) {
    ArrowArray* arrow_array = arrow_table.first.get();
    ArrowSchema* arrow_schema = arrow_table.second.get();
    // Make sure the child-count is the same
    _check_shapes(arrow_array, arrow_schema);

    if (arrow_schema->n_children == 0) {
        throw std::runtime_error(
            "ArrowAdapter::_get_column_index_from_name: internal coding "
            "error: childless table");
    }

    for (int64_t i = 0; i < arrow_schema->n_children; i++) {
        // The Arrow C data interface permits a null field name; such a
        // column can never match, and must not be handed to a C-string
        // comparison.
        const char* child_name = arrow_schema->children[i]->name;
        if (child_name != nullptr && column_name == child_name) {
            return i;
        }
    }

    throw std::runtime_error(fmt::format(
        "ArrowAdapter::_get_column_index_from_name: column {} not found",
        column_name));
}

// Returns the top-level column (child array) of `arrow_table` at
// `column_index`, after validating that it has the expected layout.
//
// `expected_n_buffers` must be 2 (non-string data: validity + data) or
// 3 (string data: validity + offsets + data).
//
// Throws std::runtime_error if:
// * `column_index` is outside 0..n_children-1;
// * the column pointer is null;
// * the column itself has children (is non-terminal);
// * `expected_n_buffers` is neither 2 nor 3;
// * the column's buffer count differs from `expected_n_buffers`.
ArrowArray* ArrowAdapter::_get_and_check_column(
    const ArrowTable& arrow_table,
    int64_t column_index,
    int64_t expected_n_buffers) {
    ArrowArray* arrow_array = arrow_table.first.get();
    if (column_index < 0 || column_index >= arrow_array->n_children) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: column index {} out of "
            "bounds {}..{}",
            column_index,
            0,
            arrow_array->n_children - 1));
    }

    ArrowArray* child = arrow_array->children[column_index];

    // Guard the dereferences below (and in callers) against a missing child.
    if (child == nullptr) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: column index {} is null",
            column_index));
    }

    if (child->n_children != 0) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: column index {} is "
            "non-terminal",
            column_index));
    }

    if (expected_n_buffers != 2 && expected_n_buffers != 3) {
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: internal coding error: "
            "expected_n_buffers {} is "
            "neither 2 nor 3.",
            expected_n_buffers));
    }

    if (child->n_buffers != expected_n_buffers) {
        // Three buffers means string data; two means everything else.
        const char* data_kind = (expected_n_buffers == 3) ? "string" :
                                                            "non-string";
        throw std::runtime_error(fmt::format(
            "ArrowAdapter::_get_and_check_column: column index {} "
            "has buffer count {}; expected {} for {} data",
            column_index,
            child->n_buffers,
            expected_n_buffers,
            data_kind));
    }

    return child;
}

} // namespace tiledbsoma
193 changes: 182 additions & 11 deletions libtiledbsoma/src/utils/arrow_adapter.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,8 +283,20 @@ class ArrowAdapter {
return make_arrow_array_child<T>(v);
}

// Convenience overload: builds a two-element string column holding
// `pair.first` followed by `pair.second`, by delegating to the
// vector-of-strings overload.
static ArrowArray* make_arrow_array_child_string(
    const std::pair<std::string, std::string>& pair) {
    const std::vector<std::string> endpoints{pair.first, pair.second};
    return make_arrow_array_child_string(endpoints);
}

template <typename T>
static ArrowArray* make_arrow_array_child(const std::vector<T>& v) {
if (std::is_same_v<T, std::string>) {
throw std::runtime_error(
"ArrowAdapter::make_arrow_array_child: template-specialization "
"failure.");
}

// Use new here, not malloc, to match ArrowAdapter::release_array
auto arrow_array = new ArrowArray;

Expand All @@ -293,11 +305,24 @@ class ArrowAdapter {
arrow_array->length = n;
arrow_array->null_count = 0;
arrow_array->offset = 0;

// Two-buffer model for non-string data:
// * Slot 0 is the Arrow validity buffer which we leave null
// * Slot 1 is data, void* but will be dereferenced as T*
// * There is no offset information
arrow_array->n_buffers = 2;
arrow_array->release = &ArrowAdapter::release_array;
arrow_array->buffers = new const void*[2];
arrow_array->n_children = 0; // leaf/child node

// The nominal use of these methods as of this writing is for
// low-volume data such as schema information -- less than a
// kilobyte total. It's simplest and safest to do data copies,
// for-loop-wise. If we were to extend usage of these methods
// to bulk data in the megabyte/gigabyte range, we'd want to
// look at zero-copy for buffers, with variable approaches
// to memory management.
arrow_array->release = &ArrowAdapter::release_array;

arrow_array->buffers[0] = nullptr;
// Use malloc here, not new, to match ArrowAdapter::release_array
T* dest = (T*)malloc(n * sizeof(T));
Expand All @@ -309,30 +334,162 @@ class ArrowAdapter {
return arrow_array;
}

static ArrowArray* make_arrow_array_child(
// A nominal use of this method is for reporting core domain, current
// domain, and non-empty domain back to Python/R. Meanwhile core string
// dims must always have domain of (nullptr, nullptr); and they have current
// domain which must _not_ be nullptr pairs.
//
// For the former do we give back a column of length 2 with nulls in it,
// using Arrow's validity buffers? Or do we use ("", "") as TileDB-Py does?
//
// We choose the latter.
// Builds a leaf Arrow large_string array (64-bit offsets) holding a copy of
// every string in `v`. No validity buffer is produced: null_count is 0.
//
// Throws std::runtime_error if the offsets or data buffer cannot be
// allocated.
static ArrowArray* make_arrow_array_child_string(
    const std::vector<std::string>& v) {
    const size_t n = v.size();

    size_t nbytes = 0;
    for (const auto& e : v) {
        nbytes += e.size();
    }

    // This function produces arrow large_string, which has 64-bit offsets.
    // Buffers are malloc'ed, as in make_arrow_array_child.
    // NOTE(review): presumably ArrowAdapter::release_array free()s them --
    // confirm.
    uint64_t* offsets = static_cast<uint64_t*>(
        malloc((n + 1) * sizeof(uint64_t)));

    // malloc(0) is allowed to return nullptr. Always request at least one
    // byte so that a column consisting solely of empty strings still gets a
    // non-null data buffer; consumers that treat a null data buffer as an
    // error will then accept it.
    char* data = static_cast<char*>(malloc(nbytes > 0 ? nbytes : 1));

    if (offsets == nullptr || data == nullptr) {
        free(offsets);
        free(data);
        throw std::runtime_error(
            "ArrowAdapter::make_arrow_array_child_string: out of memory");
    }

    // Pack the strings back-to-back; offsets[i]..offsets[i+1] delimits
    // string i within the data buffer.
    uint64_t dest_start = 0;
    offsets[0] = dest_start;
    for (size_t i = 0; i < n; i++) {
        const std::string& elem = v[i];
        const size_t elem_len = elem.size();

        if (elem_len > 0) {
            memcpy(data + dest_start, elem.data(), elem_len);
        }
        dest_start += elem_len;
        offsets[i + 1] = dest_start;
    }

    // Use new here, not malloc, to match ArrowAdapter::release_array
    auto arrow_array = new ArrowArray;

    arrow_array->length = n;  // Number of strings, not number of bytes
    arrow_array->null_count = 0;
    arrow_array->offset = 0;

    // Three-buffer model for string data:
    // * Slot 0 is the Arrow uint8_t* validity buffer
    // * Slot 1 is the Arrow offsets buffer: uint32_t* for Arrow string
    //   or uint64_t* for Arrow large_string
    // * Slot 2 is the character data, dereferenced as char*
    arrow_array->n_buffers = 3;
    arrow_array->buffers = new const void*[3];
    arrow_array->n_children = 0;  // leaf/child node
    arrow_array->release = &ArrowAdapter::release_array;

    arrow_array->buffers[0] = nullptr;  // validity
    arrow_array->buffers[1] = offsets;
    arrow_array->buffers[2] = data;

    return arrow_array;
}

// These table-column getters are, as of this writing, intended primarily
// for keystroke-reduction in unit-test cases.

// Looks up the column called `column_name` in `arrow_table` and returns a
// copy of its values as a std::vector<T>. The name is resolved to an index
// and the read is delegated to get_table_column_by_index<T>.
template <typename T>
static std::vector<T> get_table_column_by_name(
    const ArrowTable& arrow_table, std::string column_name) {
    return get_table_column_by_index<T>(
        arrow_table, _get_column_index_from_name(arrow_table, column_name));
}

// String-column counterpart of get_table_column_by_name: resolves
// `column_name` to an index and delegates the read to
// get_table_string_column_by_index.
static std::vector<std::string> get_table_string_column_by_name(
    const ArrowTable& arrow_table, std::string column_name) {
    return get_table_string_column_by_index(
        arrow_table, _get_column_index_from_name(arrow_table, column_name));
}

// Returns a copy of the non-string column at `column_index` of `arrow_table`
// as a std::vector<T>. The column's data buffer is reinterpreted as an array
// of T; the caller is responsible for choosing a T that matches the column's
// Arrow type.
//
// Throws std::runtime_error if:
// * T is std::string (use get_table_string_column_by_index instead);
// * the array/schema shapes disagree, or the column fails the two-buffer
//   checks in _get_and_check_column;
// * the column has a validity buffer (Arrow nulls are unsupported here);
// * the data buffer is null;
// * the column's length or offset is negative.
template <typename T>
static std::vector<T> get_table_column_by_index(
    const ArrowTable& arrow_table, int64_t column_index) {
    ArrowArray* arrow_array = arrow_table.first.get();
    ArrowSchema* arrow_schema = arrow_table.second.get();
    _check_shapes(arrow_array, arrow_schema);

    if (std::is_same_v<T, std::string>) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_column_by_index: template-specialization "
            "failure.");
    }

    ArrowArray* child = _get_and_check_column(arrow_table, column_index, 2);

    // For our purposes -- reporting domains, etc. -- we don't use the Arrow
    // validity buffers. If this class needs to be extended someday to
    // support arrow-nulls, we can work on that.
    if (child->buffers[0] != nullptr) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_column_by_index: validity buffer "
            "unsupported here");
    }

    const void* vdata = child->buffers[1];
    if (vdata == nullptr) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_column_by_index: null data buffer");
    }

    if (child->length < 0 || child->offset < 0) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_column_by_index: negative length or "
            "offset");
    }

    // Per the Arrow C data interface, the logical values start `offset`
    // elements past the physical start of the data buffer.
    const T* first = static_cast<const T*>(vdata) + child->offset;
    return std::vector<T>(first, first + child->length);
}

// Returns a copy of the Arrow large_string column at `column_index` of
// `arrow_table` as a std::vector<std::string>.
//
// Throws std::runtime_error if:
// * the array/schema shapes disagree, or the column fails the three-buffer
//   checks in _get_and_check_column;
// * the column has a validity buffer (Arrow nulls are unsupported here);
// * the data buffer or the offsets buffer is null;
// * the schema format is not "U" (Arrow large_string, 64-bit offsets);
// * the column's length or offset is negative;
// * the offsets are not non-decreasing.
static std::vector<std::string> get_table_string_column_by_index(
    const ArrowTable& arrow_table, int64_t column_index) {
    ArrowArray* arrow_array = arrow_table.first.get();
    ArrowSchema* arrow_schema = arrow_table.second.get();
    _check_shapes(arrow_array, arrow_schema);

    ArrowArray* child = _get_and_check_column(arrow_table, column_index, 3);

    // For our purposes -- reporting domains, etc. -- we don't use the Arrow
    // validity buffers. If this class needs to be extended someday to
    // support arrow-nulls, we can work on that.
    if (child->buffers[0] != nullptr) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_string_column_by_index: validity buffer "
            "unsupported here");
    }

    const char* data = static_cast<const char*>(child->buffers[2]);
    if (data == nullptr) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_string_column_by_index: null data "
            "buffer");
    }

    // Only large_string is handled: the offsets buffer is read as uint64_t.
    const char* format = arrow_schema->children[column_index]->format;
    if (format == nullptr || strcmp(format, "U") != 0) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_string_column_by_index: expected Arrow "
            "large_string");
    }

    const uint64_t* offsets = static_cast<const uint64_t*>(child->buffers[1]);
    if (offsets == nullptr) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_string_column_by_index: null offsets "
            "buffer");
    }

    if (child->length < 0 || child->offset < 0) {
        throw std::runtime_error(
            "ArrowAdapter::get_table_string_column_by_index: negative length "
            "or offset");
    }

    // Per the Arrow C data interface, logical cell j lives at physical slot
    // (offset + j) of the offsets buffer.
    offsets += child->offset;

    const int64_t num_cells = child->length;
    std::vector<std::string> retval;
    retval.reserve(static_cast<size_t>(num_cells));
    for (int64_t j = 0; j < num_cells; j++) {
        const uint64_t begin = offsets[j];
        const uint64_t end = offsets[j + 1];
        // A decreasing offset pair would make the string range invalid.
        if (end < begin) {
            throw std::runtime_error(
                "ArrowAdapter::get_table_string_column_by_index: offsets "
                "are not non-decreasing");
        }
        retval.emplace_back(data + begin, data + end);
    }

    return retval;
}

private:
static std::pair<const void*, std::size_t> _get_data_and_length(
Enumeration& enmr, const void* dst);
Expand Down Expand Up @@ -395,6 +552,20 @@ class ArrowAdapter {

static tiledb_layout_t _get_order(std::string order);

// Throws std::runtime_error if the array and the schema don't have the same
// recursive child-counts: the counts are compared at this level and then,
// pairwise, for every child.
static void _check_shapes(
ArrowArray* arrow_array, ArrowSchema* arrow_schema);

// Returns the zero-based index of the first top-level column whose schema
// name equals column_name. Validates array/schema shapes via _check_shapes
// first. Throws std::runtime_error if the table has no columns or doesn't
// have the column name.
static int64_t _get_column_index_from_name(
const ArrowTable& arrow_table, std::string column_name);

// Returns the top-level column (child array) at column_index.
// expected_n_buffers must be 2 (non-string data) or 3 (string data).
// Throws std::runtime_error if column_index is out of bounds, if the column
// itself has children, if the column's buffer count differs from
// expected_n_buffers, or if expected_n_buffers is neither 2 nor 3.
static ArrowArray* _get_and_check_column(
const ArrowTable& arrow_table,
int64_t column_index,
int64_t expected_n_buffers);

}; // class ArrowAdapter

}; // namespace tiledbsoma
Expand Down
1 change: 1 addition & 0 deletions libtiledbsoma/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ add_executable(unit_soma
$<TARGET_OBJECTS:TILEDBSOMA_NANOARROW_OBJECT>
common.cc
common.h
unit_arrow_adapter.cc
unit_column_buffer.cc
unit_managed_query.cc
unit_soma_array.cc
Expand Down
4 changes: 2 additions & 2 deletions libtiledbsoma/test/common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,10 +198,10 @@ static std::unique_ptr<ArrowArray> _create_index_cols_info_array(
// handle this case.
if (info.use_current_domain) {
std::vector<std::string> dom({"", "", "", "", ""});
dim_array = ArrowAdapter::make_arrow_array_child(dom);
dim_array = ArrowAdapter::make_arrow_array_child_string(dom);
} else {
std::vector<std::string> dom({"", "", ""});
dim_array = ArrowAdapter::make_arrow_array_child(dom);
dim_array = ArrowAdapter::make_arrow_array_child_string(dom);
}
}

Expand Down
Loading
Loading