Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[r] Min-sizing for dataframes/arrays with new shape feature #3208

Merged
merged 2 commits into from
Oct 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apis/r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices,
like those commonly used for single cell data analysis. It is documented at
<https://github.com/single-cell-data>; a formal specification is available at
<https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md>.
Version: 1.15.99.8
Version: 1.15.99.9
Authors@R: c(
person(given = "Aaron", family = "Wolen",
role = c("cre", "aut"), email = "[email protected]",
Expand Down
1 change: 1 addition & 0 deletions apis/r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* Support for dense current domain with core 2.27 [#3180](https://github.com/single-cell-data/TileDB-SOMA/pull/3180)
* Fix `is_named_list` bug for half-named lists [#3183](https://github.com/single-cell-data/TileDB-SOMA/pull/3183)
* Expose block/random writer for sparse arrays [#3204](https://github.com/single-cell-data/TileDB-SOMA/pull/3204)
* Min-sizing for dataframes/arrays with new shape feature [#3208](https://github.com/single-cell-data/TileDB-SOMA/pull/3208)

# tiledbsoma 1.14.1

Expand Down
11 changes: 6 additions & 5 deletions apis/r/R/SOMACollectionBase.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,20 +94,21 @@ SOMACollectionBase <- R6::R6Class(
#' @param key The key to be added.
#' @param schema Arrow schema argument passed on to DataFrame$create()
#' @param index_column_names Index column names passed on to DataFrame$create()
#' @param domain As in ``SOMADataFrameCreate``.
#' @template param-platform-config
add_new_dataframe = function(key, schema, index_column_names, platform_config = NULL) {
add_new_dataframe = function(key, schema, index_column_names, domain, platform_config = NULL) {
## TODO: Check argument validity
ndf <- SOMADataFrame$new(
sdf <- SOMADataFrame$new(
uri = file_path(self$uri, key),
platform_config = platform_config %||% private$.tiledb_platform_config,
tiledbsoma_ctx = private$.tiledbsoma_ctx,
tiledb_timestamp = self$tiledb_timestamp, # Cached value from $new()/SOMACollectionOpen
internal_use_only = "allowed_use"
)

ndf$create(schema, index_column_names, internal_use_only = "allowed_use")
super$set(ndf, key)
ndf
sdf$create(schema, index_column_names=index_column_names, domain=domain, internal_use_only = "allowed_use")
super$set(sdf, key)
sdf
},

#' @description Add a new SOMA DenseNdArray to this collection. (lifecycle: maturing)
Expand Down
21 changes: 20 additions & 1 deletion apis/r/R/utils-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,26 @@ get_domain_and_extent_dataframe <- function(tbl_schema, ind_col_names, domain =

requested_slot <- domain[[ind_col_name]]
ind_cur_dom <- if (is.null(requested_slot)) {
ind_max_dom
if (.new_shape_feature_flag_is_enabled()) {
# New shape: if the slot is null, make the size as small
# as possible since current domain can only be resized upward.
#
# Core current-domain semantics are (lo, hi) with both
# inclusive, with lo <= hi. This means smallest is (0, 0)
# which is shape 1, not 0.
if (bit64::is.integer64(ind_max_dom)) {
c(bit64::as.integer64(0), bit64::as.integer64(0))
} else if (is.integer(ind_max_dom)) {
c(0L, 0L)
} else {
c(0, 0)
}
} else {
# Old shape: if the slot is null, make the size as large
# as possible since there is no current domain, and the
# max domain is immutable.
ind_max_dom
}
} else {
requested_slot
}
Expand Down
12 changes: 12 additions & 0 deletions apis/r/tests/testthat/helper-test-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,15 @@ create_arrow_table <- function(nrows = 10L, factors = FALSE) {
# schema = create_arrow_schema(false)
)
}

# Build the per-column domain list used by the test helpers.
#
# Returns a named list with one entry per candidate index column. Numeric
# columns carry a two-element (low, high) vector; `string_column` and `grp`
# are present in the list but hold NULL.
domain_for_arrow_table <- function() {
  # Both integer-like columns share the same bounds.
  zero_to_million <- c(0, 1000000)

  list(
    int_column = zero_to_million,
    soma_joinid = zero_to_million,
    float_column = c(-1e6, 1e6),
    string_column = NULL,
    grp = NULL
  )
}
21 changes: 18 additions & 3 deletions apis/r/tests/testthat/helper-test-soma-objects.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,22 @@ create_and_populate_soma_dataframe <- function(
) {
set.seed(seed)

# arrow_schema <- create_arrow_schema()
tbl <- create_arrow_table(nrows = nrows, factors = factors)

sdf <- SOMADataFrameCreate(uri, tbl$schema, index_column_names = index_column_names)
full_domain <- domain_for_arrow_table()
# Pick out the index-column names actually being used in this case
domain <- list()
for (index_column in index_column_names) {
domain[[index_column]] <- full_domain[[index_column]]
}

sdf <- SOMADataFrameCreate(
uri,
tbl$schema,
index_column_names = index_column_names,
domain = domain
)

sdf$write(tbl)

if (is.null(mode)) {
Expand Down Expand Up @@ -67,11 +79,14 @@ create_and_populate_var <- function(
rep_len("lvl2", length.out = floor(nrows / 2))
))
}
domain <- list(
soma_joinid = c(0, nrows - 1L)
)

dname <- dirname(uri)
if (!dir.exists(dname)) dir.create(dname)

sdf <- SOMADataFrameCreate(uri, tbl$schema, index_column_names = "soma_joinid")
sdf <- SOMADataFrameCreate(uri, tbl$schema, index_column_names = "soma_joinid", domain = domain)
sdf$write(tbl)

if (is.null(mode)) {
Expand Down
34 changes: 28 additions & 6 deletions apis/r/tests/testthat/test-Factory.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,19 @@ test_that("DataFrame Factory", {

# Check creation of a DF
asch <- create_arrow_schema(foo_first=FALSE)
expect_silent(d2 <- SOMADataFrameCreate(uri, schema = asch))
tbl <- arrow::arrow_table(soma_joinid = 1L:10L, int_column = 1L:10L, float_column = sqrt(1:10),
string_column = letters[1:10], schema = asch)

expect_silent(d2 <- SOMADataFrameCreate(
uri,
schema = asch,
domain = list(soma_joinid = c(0, 99))
))

tbl <- arrow::arrow_table(
soma_joinid = 1L:10L,
int_column = 1L:10L,
float_column = sqrt(1:10),
string_column = letters[1:10],
schema = asch)
d2$write(tbl)

# Check opening to read
Expand All @@ -26,9 +36,21 @@ test_that("DataFrame Factory with specified index_column_names", {
# Check creation of a DF
asch <- create_arrow_schema()
expect_error(d2 <- SOMADataFrameCreate(uri, index_column_names = "int_column")) # misses schema
expect_silent(d2 <- SOMADataFrameCreate(uri, schema = asch, index_column_names = "int_column"))
tbl <- arrow::arrow_table(int_column = 1L:10L, soma_joinid = 1L:10L, float_column = sqrt(1:10),
string_column = letters[1:10], schema = asch)

expect_silent(d2 <- SOMADataFrameCreate(
uri,
schema = asch,
index_column_names = "int_column",
domain = list(int_column = c(1, 10))
))

tbl <- arrow::arrow_table(
int_column = 1L:10L,
soma_joinid = 1L:10L,
float_column = sqrt(1:10),
string_column = letters[1:10],
schema = asch)

d2$write(tbl)

# Check opening to read
Expand Down
2 changes: 1 addition & 1 deletion apis/r/tests/testthat/test-OrderedAndFactor.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ test_that("SOMADataFrame round-trip with factor and ordered", {
expect_equal(names(lvls), colnames(et))

#sdf <- SOMADataFrameCreate(uri, sch)
sdf <- SOMADataFrameCreate(uri, att$schema)
sdf <- SOMADataFrameCreate(uri, att$schema, domain = list(soma_joinid = c(0, 999)))
expect_true(inherits(sdf, "SOMADataFrame"))

sdf$write(att)
Expand Down
5 changes: 3 additions & 2 deletions apis/r/tests/testthat/test-SOMACollection.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ test_that("SOMACollection basics", {
subcollection$close()

# Add another dataframe to the collection, this time using add_new_dataframe
collection$add_new_dataframe("new_df", create_arrow_schema(), "int_column")$close()
collection$add_new_dataframe("new_df", create_arrow_schema(), "int_column", domain = list(int_column = c(0, 999)))$close()
df3 <- collection$get("new_df")
df3 <- SOMADataFrameOpen(df3$uri)
expect_true(df3$soma_type == "SOMADataFrame")
Expand Down Expand Up @@ -131,7 +131,7 @@ test_that("Platform config and context are respected by add_ methods", {

# Add a dataframe element to the collection
tbl <- create_arrow_table()
sdf1 <- collection$add_new_dataframe("sdf1", tbl$schema, "soma_joinid")
sdf1 <- collection$add_new_dataframe("sdf1", tbl$schema, "soma_joinid", domain = list(soma_joinid = c(0, 999)))
sdf1$write(tbl)
collection$close()

Expand All @@ -154,6 +154,7 @@ test_that("Platform config and context are respected by add_ methods", {
key = "sdf2",
schema = tbl$schema,
index_column_names = "soma_joinid",
domain = list(soma_joinid = c(0, 999)),
platform_config = cfg
)
sdf2$write(tbl)
Expand Down
33 changes: 23 additions & 10 deletions apis/r/tests/testthat/test-SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ test_that("Basic mechanics", {
)
if (dir.exists(uri)) unlink(uri, recursive=TRUE)

sdf <- SOMADataFrameCreate(uri, asch, index_column_names = "int_column")
sdf <- SOMADataFrameCreate(uri, asch, index_column_names = "int_column", domain = list(int_column = c(1, 36)))
expect_true(sdf$exists())
expect_true(dir.exists(uri))

Expand Down Expand Up @@ -127,7 +127,7 @@ test_that("Basic mechanics with default index_column_names", {
)
if (dir.exists(uri)) unlink(uri, recursive=TRUE)

sdf$create(asch, internal_use_only = "allowed_use")
sdf$create(asch, domain = list(soma_joinid = c(0, 99)), internal_use_only = "allowed_use")
expect_true(sdf$exists())
expect_true(dir.exists(uri))
expect_match(sdf$soma_type, "SOMADataFrame")
Expand Down Expand Up @@ -176,6 +176,15 @@ test_that("creation with all supported dimension data types", {
arrow::field("string", arrow::utf8(), nullable = FALSE)
)

domains <- list(
int8 = c(1L, 36L),
int16 = c(1L, 36L),
double = c(1.1, 36.1),
int = c(1L, 36L),
int64 = c(bit64::as.integer64(1L), bit64::as.integer64(36L)),
string = NULL
)

tbl0 <- arrow::arrow_table(
int8 = 1L:36L,
int16 = 1:36L,
Expand All @@ -192,7 +201,7 @@ test_that("creation with all supported dimension data types", {
for (dtype in tbl0$ColumnNames()) {
uri <- tempfile(pattern=paste0("soma-dataframe-", dtype))
expect_silent(
sdf <- SOMADataFrameCreate(uri, tbl0$schema, index_column_names = dtype)
sdf <- SOMADataFrameCreate(uri, tbl0$schema, index_column_names = dtype, domain = domains[dtype])
)
expect_true(sdf$exists())
sdf$close()
Expand All @@ -211,7 +220,7 @@ test_that("int64 values are stored correctly", {
)
if (dir.exists(uri)) unlink(uri, recursive=TRUE)

sdf <- SOMADataFrameCreate(uri, asch, index_column_names = "int_column")
sdf <- SOMADataFrameCreate(uri, asch, index_column_names = "int_column", domain = list(int_column = c(1, 10)))
tbl0 <- arrow::arrow_table(int_column = 1L:10L, soma_joinid = 1L:10L, schema = asch)

orig_downcast_value <- getOption("arrow.int64_downcast")
Expand Down Expand Up @@ -245,7 +254,9 @@ test_that("creation with ordered factors", {
tbl <- arrow::as_arrow_table(df)
expect_true(tbl$schema$GetFieldByName("ord")$type$ordered)
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
expect_no_condition(sdf <- SOMADataFrameCreate(uri = uri, schema = tbl$schema))
expect_no_condition(
sdf <- SOMADataFrameCreate(uri = uri, schema = tbl$schema, domain = list(soma_joinid = c(0, n-1L)))
)
expect_no_condition(sdf$write(values = tbl))
expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame")
expect_true(sdf$schema()$GetFieldByName("ord")$type$ordered)
Expand All @@ -270,7 +281,9 @@ test_that("explicit casting of ordered factors to regular factors", {
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
tbl <- arrow::as_arrow_table(df)
expect_true(tbl$schema$GetFieldByName("ord")$type$ordered)
expect_no_condition(sdf <- SOMADataFrameCreate(uri = uri, schema = tbl$schema,))
expect_no_condition(
sdf <- SOMADataFrameCreate(uri = uri, schema = tbl$schema, domain = list(soma_joinid = c(0, n-1L)))
)
expect_no_condition(sdf$write(values = tbl))
expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame")
expect_true(sdf$schema()$GetFieldByName("ord")$type$ordered)
Expand Down Expand Up @@ -577,7 +590,7 @@ test_that("SOMADataFrame timestamped ops", {
arrow::field("valint", arrow::int32(), nullable=FALSE),
arrow::field("valdbl", arrow::float64(), nullable=FALSE))
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
sdf <- SOMADataFrameCreate(uri=uri, schema=sch)
sdf <- SOMADataFrameCreate(uri=uri, schema=sch, domain = list(soma_joinid = c(1, 100)))
rb1 <- arrow::record_batch(soma_joinid = bit64::as.integer64(1L:3L),
valint = 1L:3L,
valdbl = 100*(1:3),
Expand Down Expand Up @@ -813,7 +826,7 @@ test_that("missing levels in enums", {
# Create SOMADataFrame w/ missing enum levels
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
tbl <- arrow::as_arrow_table(df)
sdf <- SOMADataFrameCreate(uri, tbl$schema)
sdf <- SOMADataFrameCreate(uri, tbl$schema, domain = list(soma_joinid = c(0, n-1)))
on.exit(sdf$close())
sdf$write(tbl)
sdf$close()
Expand Down Expand Up @@ -874,7 +887,7 @@ test_that("factor levels can grow without overlap", {
arrow::field(name = "obs_col_like",
type = arrow::dictionary(index_type = arrow::int8(), ordered = FALSE)))

sdf <- SOMADataFrameCreate(uri, schema)
sdf <- SOMADataFrameCreate(uri, schema, domain = list(soma_joinid = c(0, 5)))

tbl_1 <- arrow::arrow_table(soma_joinid = bit64::as.integer64(c(0,1,2)),
obs_col_like = factor(c("A", "B", "A")),
Expand Down Expand Up @@ -917,7 +930,7 @@ test_that("factor levels cannot extend beyond index limit", {
df <- data.frame(soma_joinid = bit64::as.integer64(seq_len(65)),
obs = factor(paste0("elem", seq_len(65))))
tbl <- arrow::as_arrow_table(df, schema = sch)
expect_silent(SOMADataFrameCreate(uri, sch)$write(tbl)$close())
expect_silent(SOMADataFrameCreate(uri, sch, domain = list(soma_joinid = c(0, 999)))$write(tbl)$close())

df2 <- data.frame(soma_joinid = bit64::as.integer64(65 + seq_len(65)),
obs = factor(paste0("elem_", 65 + seq_len(65))))
Expand Down
2 changes: 1 addition & 1 deletion apis/r/tests/testthat/test-Timestamps.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ test_that("SOMADataFrame", {

## create at t = 1
ts1 <- as.POSIXct(1, tz = "UTC", origin = "1970-01-01")
sdf <- tiledbsoma::SOMADataFrameCreate(uri, sch, tiledb_timestamp = ts1)
sdf <- tiledbsoma::SOMADataFrameCreate(uri, sch, tiledb_timestamp = ts1, domain = list(soma_joinid=c(0, 999)))

## write part1 at t = 2
dat2 <- arrow::arrow_table(soma_joinid = bit64::as.integer64(1L:5L),
Expand Down
8 changes: 7 additions & 1 deletion apis/r/tests/testthat/test-query-condition.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,13 @@ test_that("DataFrame Factory", {
# arrow::field("datetime_day", arrow::date32())
)

sdf <- SOMADataFrameCreate(uri, sch, index_column_names = "soma_joinid")
sdf <- SOMADataFrameCreate(
uri,
sch,
index_column_names = "soma_joinid",
domain = list(soma_joinid = c(0, 999))
)

expect_true(sdf$exists())
expect_true(dir.exists(uri))

Expand Down
3 changes: 2 additions & 1 deletion apis/r/tests/testthat/test-reopen.R
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ test_that("`reopen()` works on nested collections", {
soma_joinid = bit64::integer64(),
int = integer()
)),
index_column_names = "soma_joinid"
index_column_names = "soma_joinid",
domain = list(soma_joinid = c(0, 999))
)
expect_length(col$names(), 4L)

Expand Down
Loading