Skip to content

Commit

Permalink
[r] Min-sizing for dataframes/arrays with new shape feature (#3208)
Browse files Browse the repository at this point in the history
* [r] Min-sizing for dataframes/arrays [WIP]

* DESCRIPTION NEWS.md [skip ci]
  • Loading branch information
johnkerl authored Oct 20, 2024
1 parent 60bdcf5 commit 003fd1d
Show file tree
Hide file tree
Showing 14 changed files with 315 additions and 253 deletions.
2 changes: 1 addition & 1 deletion apis/r/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Description: Interface for working with 'TileDB'-based Stack of Matrices,
like those commonly used for single cell data analysis. It is documented at
<https://github.com/single-cell-data>; a formal specification is available at
<https://github.com/single-cell-data/SOMA/blob/main/abstract_specification.md>.
Version: 1.15.99.8
Version: 1.15.99.9
Authors@R: c(
person(given = "Aaron", family = "Wolen",
role = c("cre", "aut"), email = "[email protected]",
Expand Down
1 change: 1 addition & 0 deletions apis/r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* Support for dense current domain with core 2.27 [#3180](https://github.com/single-cell-data/TileDB-SOMA/pull/3180)
* Fix `is_named_list` bug for half-named lists [#3183](https://github.com/single-cell-data/TileDB-SOMA/pull/3183)
* Expose block/random writer for sparse arrays [#3204](https://github.com/single-cell-data/TileDB-SOMA/pull/3204)
* Min-sizing for dataframes/arrays with new shape feature [#3208](https://github.com/single-cell-data/TileDB-SOMA/pull/3208)

# tiledbsoma 1.14.1

Expand Down
11 changes: 6 additions & 5 deletions apis/r/R/SOMACollectionBase.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,20 +94,21 @@ SOMACollectionBase <- R6::R6Class(
#' @param key The key to be added.
#' @param schema Arrow schema argument passed on to DataFrame$create()
#' @param index_column_names Index column names passed on to DataFrame$create()
#' @param domain As in ``SOMADataFrameCreate``.
#' @template param-platform-config
add_new_dataframe = function(key, schema, index_column_names, platform_config = NULL) {
add_new_dataframe = function(key, schema, index_column_names, domain, platform_config = NULL) {
## TODO: Check argument validity
ndf <- SOMADataFrame$new(
sdf <- SOMADataFrame$new(
uri = file_path(self$uri, key),
platform_config = platform_config %||% private$.tiledb_platform_config,
tiledbsoma_ctx = private$.tiledbsoma_ctx,
tiledb_timestamp = self$tiledb_timestamp, # Cached value from $new()/SOMACollectionOpen
internal_use_only = "allowed_use"
)

ndf$create(schema, index_column_names, internal_use_only = "allowed_use")
super$set(ndf, key)
ndf
sdf$create(schema, index_column_names=index_column_names, domain=domain, internal_use_only = "allowed_use")
super$set(sdf, key)
sdf
},

#' @description Add a new SOMA DenseNdArray to this collection. (lifecycle: maturing)
Expand Down
21 changes: 20 additions & 1 deletion apis/r/R/utils-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,26 @@ get_domain_and_extent_dataframe <- function(tbl_schema, ind_col_names, domain =

requested_slot <- domain[[ind_col_name]]
ind_cur_dom <- if (is.null(requested_slot)) {
ind_max_dom
if (.new_shape_feature_flag_is_enabled()) {
# New shape: if the slot is null, make the size as small
# as possible since current domain can only be resized upward.
#
# Core current-domain semantics are (lo, hi) with both
# inclusive, with lo <= hi. This means smallest is (0, 0)
# which is shape 1, not 0.
if (bit64::is.integer64(ind_max_dom)) {
c(bit64::as.integer64(0), bit64::as.integer64(0))
} else if (is.integer(ind_max_dom)) {
c(0L, 0L)
} else {
c(0, 0)
}
} else {
# Old shape: if the slot is null, make the size as large
# as possible since there is no current domain, and the
# max domain is immutable.
ind_max_dom
}
} else {
requested_slot
}
Expand Down
12 changes: 12 additions & 0 deletions apis/r/tests/testthat/helper-test-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,15 @@ create_arrow_table <- function(nrows = 10L, factors = FALSE) {
# schema = create_arrow_schema(false)
)
}

# Per-column domain lookup for test dataframes.
#
# Returns a named list with one slot per candidate index column. Numeric
# columns carry a two-element c(lo, hi) pair; `string_column` and `grp` are
# present in the list but hold NULL (presumably meaning "no explicit domain
# requested" -- confirm against SOMADataFrameCreate). Callers select the slots
# for the index columns they actually use via `[[`.
domain_for_arrow_table <- function() {
  upper <- 1e6
  nonnegative_bounds <- c(0, upper)

  # list() keeps NULL-valued arguments, so the result has five named slots.
  list(
    int_column = nonnegative_bounds,
    soma_joinid = nonnegative_bounds,
    float_column = c(-upper, upper),
    string_column = NULL,
    grp = NULL
  )
}
21 changes: 18 additions & 3 deletions apis/r/tests/testthat/helper-test-soma-objects.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,22 @@ create_and_populate_soma_dataframe <- function(
) {
set.seed(seed)

# arrow_schema <- create_arrow_schema()
tbl <- create_arrow_table(nrows = nrows, factors = factors)

sdf <- SOMADataFrameCreate(uri, tbl$schema, index_column_names = index_column_names)
full_domain <- domain_for_arrow_table()
# Pick out the index-column names actually being used in this case
domain <- list()
for (index_column in index_column_names) {
domain[[index_column]] <- full_domain[[index_column]]
}

sdf <- SOMADataFrameCreate(
uri,
tbl$schema,
index_column_names = index_column_names,
domain = domain
)

sdf$write(tbl)

if (is.null(mode)) {
Expand Down Expand Up @@ -67,11 +79,14 @@ create_and_populate_var <- function(
rep_len("lvl2", length.out = floor(nrows / 2))
))
}
domain <- list(
soma_joinid = c(0, nrows - 1L)
)

dname <- dirname(uri)
if (!dir.exists(dname)) dir.create(dname)

sdf <- SOMADataFrameCreate(uri, tbl$schema, index_column_names = "soma_joinid")
sdf <- SOMADataFrameCreate(uri, tbl$schema, index_column_names = "soma_joinid", domain = domain)
sdf$write(tbl)

if (is.null(mode)) {
Expand Down
34 changes: 28 additions & 6 deletions apis/r/tests/testthat/test-Factory.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,19 @@ test_that("DataFrame Factory", {

# Check creation of a DF
asch <- create_arrow_schema(foo_first=FALSE)
expect_silent(d2 <- SOMADataFrameCreate(uri, schema = asch))
tbl <- arrow::arrow_table(soma_joinid = 1L:10L, int_column = 1L:10L, float_column = sqrt(1:10),
string_column = letters[1:10], schema = asch)

expect_silent(d2 <- SOMADataFrameCreate(
uri,
schema = asch,
domain = list(soma_joinid = c(0, 99))
))

tbl <- arrow::arrow_table(
soma_joinid = 1L:10L,
int_column = 1L:10L,
float_column = sqrt(1:10),
string_column = letters[1:10],
schema = asch)
d2$write(tbl)

# Check opening to read
Expand All @@ -26,9 +36,21 @@ test_that("DataFrame Factory with specified index_column_names", {
# Check creation of a DF
asch <- create_arrow_schema()
expect_error(d2 <- SOMADataFrameCreate(uri, index_column_names = "int_column")) # misses schema
expect_silent(d2 <- SOMADataFrameCreate(uri, schema = asch, index_column_names = "int_column"))
tbl <- arrow::arrow_table(int_column = 1L:10L, soma_joinid = 1L:10L, float_column = sqrt(1:10),
string_column = letters[1:10], schema = asch)

expect_silent(d2 <- SOMADataFrameCreate(
uri,
schema = asch,
index_column_names = "int_column",
domain = list(int_column = c(1, 10))
))

tbl <- arrow::arrow_table(
int_column = 1L:10L,
soma_joinid = 1L:10L,
float_column = sqrt(1:10),
string_column = letters[1:10],
schema = asch)

d2$write(tbl)

# Check opening to read
Expand Down
2 changes: 1 addition & 1 deletion apis/r/tests/testthat/test-OrderedAndFactor.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ test_that("SOMADataFrame round-trip with factor and ordered", {
expect_equal(names(lvls), colnames(et))

#sdf <- SOMADataFrameCreate(uri, sch)
sdf <- SOMADataFrameCreate(uri, att$schema)
sdf <- SOMADataFrameCreate(uri, att$schema, domain = list(soma_joinid = c(0, 999)))
expect_true(inherits(sdf, "SOMADataFrame"))

sdf$write(att)
Expand Down
5 changes: 3 additions & 2 deletions apis/r/tests/testthat/test-SOMACollection.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ test_that("SOMACollection basics", {
subcollection$close()

# Add another dataframe to the collection, this time using add_new_dataframe
collection$add_new_dataframe("new_df", create_arrow_schema(), "int_column")$close()
collection$add_new_dataframe("new_df", create_arrow_schema(), "int_column", domain = list(int_column = c(0, 999)))$close()
df3 <- collection$get("new_df")
df3 <- SOMADataFrameOpen(df3$uri)
expect_true(df3$soma_type == "SOMADataFrame")
Expand Down Expand Up @@ -131,7 +131,7 @@ test_that("Platform config and context are respected by add_ methods", {

# Add a dataframe element to the collection
tbl <- create_arrow_table()
sdf1 <- collection$add_new_dataframe("sdf1", tbl$schema, "soma_joinid")
sdf1 <- collection$add_new_dataframe("sdf1", tbl$schema, "soma_joinid", domain = list(soma_joinid = c(0, 999)))
sdf1$write(tbl)
collection$close()

Expand All @@ -154,6 +154,7 @@ test_that("Platform config and context are respected by add_ methods", {
key = "sdf2",
schema = tbl$schema,
index_column_names = "soma_joinid",
domain = list(soma_joinid = c(0, 999)),
platform_config = cfg
)
sdf2$write(tbl)
Expand Down
33 changes: 23 additions & 10 deletions apis/r/tests/testthat/test-SOMADataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ test_that("Basic mechanics", {
)
if (dir.exists(uri)) unlink(uri, recursive=TRUE)

sdf <- SOMADataFrameCreate(uri, asch, index_column_names = "int_column")
sdf <- SOMADataFrameCreate(uri, asch, index_column_names = "int_column", domain = list(int_column = c(1, 36)))
expect_true(sdf$exists())
expect_true(dir.exists(uri))

Expand Down Expand Up @@ -127,7 +127,7 @@ test_that("Basic mechanics with default index_column_names", {
)
if (dir.exists(uri)) unlink(uri, recursive=TRUE)

sdf$create(asch, internal_use_only = "allowed_use")
sdf$create(asch, domain = list(soma_joinid = c(0, 99)), internal_use_only = "allowed_use")
expect_true(sdf$exists())
expect_true(dir.exists(uri))
expect_match(sdf$soma_type, "SOMADataFrame")
Expand Down Expand Up @@ -176,6 +176,15 @@ test_that("creation with all supported dimension data types", {
arrow::field("string", arrow::utf8(), nullable = FALSE)
)

domains <- list(
int8 = c(1L, 36L),
int16 = c(1L, 36L),
double = c(1.1, 36.1),
int = c(1L, 36L),
int64 = c(bit64::as.integer64(1L), bit64::as.integer64(36L)),
string = NULL
)

tbl0 <- arrow::arrow_table(
int8 = 1L:36L,
int16 = 1:36L,
Expand All @@ -192,7 +201,7 @@ test_that("creation with all supported dimension data types", {
for (dtype in tbl0$ColumnNames()) {
uri <- tempfile(pattern=paste0("soma-dataframe-", dtype))
expect_silent(
sdf <- SOMADataFrameCreate(uri, tbl0$schema, index_column_names = dtype)
sdf <- SOMADataFrameCreate(uri, tbl0$schema, index_column_names = dtype, domain = domains[dtype])
)
expect_true(sdf$exists())
sdf$close()
Expand All @@ -211,7 +220,7 @@ test_that("int64 values are stored correctly", {
)
if (dir.exists(uri)) unlink(uri, recursive=TRUE)

sdf <- SOMADataFrameCreate(uri, asch, index_column_names = "int_column")
sdf <- SOMADataFrameCreate(uri, asch, index_column_names = "int_column", domain = list(int_column = c(1, 10)))
tbl0 <- arrow::arrow_table(int_column = 1L:10L, soma_joinid = 1L:10L, schema = asch)

orig_downcast_value <- getOption("arrow.int64_downcast")
Expand Down Expand Up @@ -245,7 +254,9 @@ test_that("creation with ordered factors", {
tbl <- arrow::as_arrow_table(df)
expect_true(tbl$schema$GetFieldByName("ord")$type$ordered)
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
expect_no_condition(sdf <- SOMADataFrameCreate(uri = uri, schema = tbl$schema))
expect_no_condition(
sdf <- SOMADataFrameCreate(uri = uri, schema = tbl$schema, domain = list(soma_joinid = c(0, n-1L)))
)
expect_no_condition(sdf$write(values = tbl))
expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame")
expect_true(sdf$schema()$GetFieldByName("ord")$type$ordered)
Expand All @@ -270,7 +281,9 @@ test_that("explicit casting of ordered factors to regular factors", {
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
tbl <- arrow::as_arrow_table(df)
expect_true(tbl$schema$GetFieldByName("ord")$type$ordered)
expect_no_condition(sdf <- SOMADataFrameCreate(uri = uri, schema = tbl$schema,))
expect_no_condition(
sdf <- SOMADataFrameCreate(uri = uri, schema = tbl$schema, domain = list(soma_joinid = c(0, n-1L)))
)
expect_no_condition(sdf$write(values = tbl))
expect_s3_class(sdf <- SOMADataFrameOpen(uri), "SOMADataFrame")
expect_true(sdf$schema()$GetFieldByName("ord")$type$ordered)
Expand Down Expand Up @@ -577,7 +590,7 @@ test_that("SOMADataFrame timestamped ops", {
arrow::field("valint", arrow::int32(), nullable=FALSE),
arrow::field("valdbl", arrow::float64(), nullable=FALSE))
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
sdf <- SOMADataFrameCreate(uri=uri, schema=sch)
sdf <- SOMADataFrameCreate(uri=uri, schema=sch, domain = list(soma_joinid = c(1, 100)))
rb1 <- arrow::record_batch(soma_joinid = bit64::as.integer64(1L:3L),
valint = 1L:3L,
valdbl = 100*(1:3),
Expand Down Expand Up @@ -813,7 +826,7 @@ test_that("missing levels in enums", {
# Create SOMADataFrame w/ missing enum levels
if (dir.exists(uri)) unlink(uri, recursive=TRUE)
tbl <- arrow::as_arrow_table(df)
sdf <- SOMADataFrameCreate(uri, tbl$schema)
sdf <- SOMADataFrameCreate(uri, tbl$schema, domain = list(soma_joinid = c(0, n-1)))
on.exit(sdf$close())
sdf$write(tbl)
sdf$close()
Expand Down Expand Up @@ -874,7 +887,7 @@ test_that("factor levels can grow without overlap", {
arrow::field(name = "obs_col_like",
type = arrow::dictionary(index_type = arrow::int8(), ordered = FALSE)))

sdf <- SOMADataFrameCreate(uri, schema)
sdf <- SOMADataFrameCreate(uri, schema, domain = list(soma_joinid = c(0, 5)))

tbl_1 <- arrow::arrow_table(soma_joinid = bit64::as.integer64(c(0,1,2)),
obs_col_like = factor(c("A", "B", "A")),
Expand Down Expand Up @@ -917,7 +930,7 @@ test_that("factor levels cannot extend beyond index limit", {
df <- data.frame(soma_joinid = bit64::as.integer64(seq_len(65)),
obs = factor(paste0("elem", seq_len(65))))
tbl <- arrow::as_arrow_table(df, schema = sch)
expect_silent(SOMADataFrameCreate(uri, sch)$write(tbl)$close())
expect_silent(SOMADataFrameCreate(uri, sch, domain = list(soma_joinid = c(0, 999)))$write(tbl)$close())

df2 <- data.frame(soma_joinid = bit64::as.integer64(65 + seq_len(65)),
obs = factor(paste0("elem_", 65 + seq_len(65))))
Expand Down
2 changes: 1 addition & 1 deletion apis/r/tests/testthat/test-Timestamps.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ test_that("SOMADataFrame", {

## create at t = 1
ts1 <- as.POSIXct(1, tz = "UTC", origin = "1970-01-01")
sdf <- tiledbsoma::SOMADataFrameCreate(uri, sch, tiledb_timestamp = ts1)
sdf <- tiledbsoma::SOMADataFrameCreate(uri, sch, tiledb_timestamp = ts1, domain = list(soma_joinid=c(0, 999)))

## write part1 at t = 2
dat2 <- arrow::arrow_table(soma_joinid = bit64::as.integer64(1L:5L),
Expand Down
8 changes: 7 additions & 1 deletion apis/r/tests/testthat/test-query-condition.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,13 @@ test_that("DataFrame Factory", {
# arrow::field("datetime_day", arrow::date32())
)

sdf <- SOMADataFrameCreate(uri, sch, index_column_names = "soma_joinid")
sdf <- SOMADataFrameCreate(
uri,
sch,
index_column_names = "soma_joinid",
domain = list(soma_joinid = c(0, 999))
)

expect_true(sdf$exists())
expect_true(dir.exists(uri))

Expand Down
3 changes: 2 additions & 1 deletion apis/r/tests/testthat/test-reopen.R
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ test_that("`reopen()` works on nested collections", {
soma_joinid = bit64::integer64(),
int = integer()
)),
index_column_names = "soma_joinid"
index_column_names = "soma_joinid",
domain = list(soma_joinid = c(0, 999))
)
expect_length(col$names(), 4L)

Expand Down
Loading

0 comments on commit 003fd1d

Please sign in to comment.