From 130ccc60007a3df1d40fb5e8902db027443fe728 Mon Sep 17 00:00:00 2001 From: John Kerl Date: Wed, 30 Oct 2024 17:18:52 -0400 Subject: [PATCH] [python] Fix some dense+2.27 failing test cases (#3265) * Fix for dense domain/maxdomain with core 2.27 * improve a comment * Fix a one-off unit-test issue with dense + 2.27 * Make this file loggable via `$SPDLOG_LEVEL` like other files * misc neatens * missed a spot * spec-conformity * fix ci --- apis/python/src/tiledbsoma/_dense_nd_array.py | 53 +++++++++++++------ .../python/src/tiledbsoma/_sparse_nd_array.py | 4 +- apis/python/src/tiledbsoma/io/ingest.py | 9 ++-- apis/python/tests/test_io.py | 4 +- apis/python/tests/test_shape.py | 7 ++- apis/r/R/SOMADenseNDArray.R | 2 +- 6 files changed, 53 insertions(+), 26 deletions(-) diff --git a/apis/python/src/tiledbsoma/_dense_nd_array.py b/apis/python/src/tiledbsoma/_dense_nd_array.py index c2c8feff37..95d90a411a 100644 --- a/apis/python/src/tiledbsoma/_dense_nd_array.py +++ b/apis/python/src/tiledbsoma/_dense_nd_array.py @@ -101,9 +101,25 @@ def create( index_column_schema = [] index_column_data = {} + for dim_idx, dim_shape in enumerate(shape): dim_name = f"soma_dim_{dim_idx}" + pa_field = pa.field(dim_name, pa.int64()) + index_column_schema.append(pa_field) + + # Here is our Arrow data API for communicating schema info between + # Python/R and C++ libtiledbsoma: + # + # [0] core max domain lo + # [1] core max domain hi + # [2] core extent parameter + # If present, these next two signal to use the current-domain feature: + # [3] core current domain lo + # [4] core current domain hi + + if dim_shape is None: + raise ValueError("DenseNDArray shape slots must be numeric") if NEW_SHAPE_FEATURE_FLAG_ENABLED and DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN: dim_capacity, dim_extent = cls._dim_capacity_and_extent( @@ -117,11 +133,6 @@ def create( if dim_shape == 0: raise ValueError("DenseNDArray shape slots must be at least 1") - if dim_shape is None: - # Core current-domain semantics are (lo, hi) with both - # inclusive, with lo <= hi. This means smallest is (0, 0) - # which is shape 1, not 0. - dim_shape = 1 index_column_data[pa_field.name] = [ 0, @@ -138,8 +149,7 @@ def create( TileDBCreateOptions.from_platform_config(platform_config), ) - index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent] - index_column_schema.append(pa_field) + index_column_data[pa_field.name] = [0, dim_capacity - 1, dim_extent] index_column_info = pa.RecordBatch.from_pydict( index_column_data, schema=pa.schema(index_column_schema) @@ -349,16 +359,25 @@ def _dim_capacity_and_extent( dim_shape: Optional[int], create_options: TileDBCreateOptions, ) -> Tuple[int, int]: - """Given a user-specified shape along a particular dimension, returns a tuple of - the TileDB capacity and extent for that dimension, suitable for schema creation. - The user-specified shape cannot be ``None`` for :class:`DenseNDArray`. + """Given a user-specified shape (maybe ``None``) along a particular dimension, + returns a tuple of the TileDB capacity and extent for that dimension, suitable + for schema creation. If the user-specified shape is None, the largest possible + int64 is returned for the capacity -- which is particularly suitable for + maxdomain. """ - if dim_shape is None or dim_shape <= 0: - raise ValueError( - "SOMADenseNDArray shape must be a non-zero-length tuple of positive ints" - ) - - dim_capacity = dim_shape - dim_extent = min(dim_shape, create_options.dim_tile(dim_name, 2048)) + if dim_shape is None: + dim_capacity = 2**63 - 1 + dim_extent = min(dim_capacity, create_options.dim_tile(dim_name, 2048)) + # For core: "domain max expanded to multiple of tile extent exceeds max value + # representable by domain type. Reduce domain max by 1 tile extent to allow for + # expansion." + dim_capacity -= dim_extent + else: + if dim_shape <= 0: + raise ValueError( + "SOMASparseNDArray shape must be a non-zero-length tuple of positive ints or Nones" + ) + dim_capacity = dim_shape + dim_extent = min(dim_shape, create_options.dim_tile(dim_name, 2048)) return (dim_capacity, dim_extent) diff --git a/apis/python/src/tiledbsoma/_sparse_nd_array.py b/apis/python/src/tiledbsoma/_sparse_nd_array.py index ddc647a6b7..585ab90bce 100644 --- a/apis/python/src/tiledbsoma/_sparse_nd_array.py +++ b/apis/python/src/tiledbsoma/_sparse_nd_array.py @@ -150,7 +150,6 @@ def create( dim_name = f"soma_dim_{dim_idx}" pa_field = pa.field(dim_name, pa.int64()) - index_column_schema.append(pa_field) # Here is our Arrow data API for communicating schema info between @@ -504,7 +503,8 @@ def _dim_capacity_and_extent( """Given a user-specified shape (maybe ``None``) along a particular dimension, returns a tuple of the TileDB capacity and extent for that dimension, suitable for schema creation. If the user-specified shape is None, the largest possible - int64 is returned for the capacity. + int64 is returned for the capacity -- which is particularly suitable for + maxdomain. """ if dim_shape is None: dim_capacity = 2**63 - 1 diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index 81fb1d80f5..cfe2ec9c9c 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -64,7 +64,10 @@ NotCreateableError, SOMAError, ) -from .._flags import NEW_SHAPE_FEATURE_FLAG_ENABLED +from .._flags import ( + DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN, + NEW_SHAPE_FEATURE_FLAG_ENABLED, +) from .._soma_array import SOMAArray from .._soma_object import AnySOMAObject, SOMAObject from .._tdb_handles import RawHandle @@ -1322,7 +1325,7 @@ def _create_from_matrix( # in the case when multiple H5ADs/AnnDatas are being # ingested to an experiment which doesn't pre-exist. shape = (axis_0_mapping.get_shape(), axis_1_mapping.get_shape()) - elif cls.is_sparse: + elif cls.is_sparse or DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN: shape = tuple(None for _ in matrix.shape) else: shape = matrix.shape @@ -1909,7 +1912,7 @@ def _write_matrix_to_denseNDArray( else: tensor = pa.Tensor.from_numpy(chunk.toarray()) if matrix.ndim == 2: - soma_ndarray.write((slice(i, i2), slice(None)), tensor) + soma_ndarray.write((slice(i, i2), slice(0, ncol)), tensor) else: soma_ndarray.write((slice(i, i2),), tensor) diff --git a/apis/python/tests/test_io.py b/apis/python/tests/test_io.py index da1f8eb2e3..157bdcc90d 100644 --- a/apis/python/tests/test_io.py +++ b/apis/python/tests/test_io.py @@ -45,7 +45,7 @@ def src_matrix(request): ], indirect=True, ) -def test_io_create_from_matrix_Dense_nd_array(tmp_path, tdb_create_options, src_matrix): +def test_io_create_from_matrix_dense_nd_array(tmp_path, tdb_create_options, src_matrix): """ Test soma.io.from_matrix to a DenseNDArray. @@ -94,7 +94,7 @@ def test_io_create_from_matrix_Dense_nd_array(tmp_path, tdb_create_options, src_ ], indirect=True, ) -def test_io_create_from_matrix_Sparse_nd_array( +def test_io_create_from_matrix_sparse_nd_array( tmp_path, tdb_create_options, src_matrix ): """ diff --git a/apis/python/tests/test_shape.py b/apis/python/tests/test_shape.py index d74264751c..4777a4d6b0 100644 --- a/apis/python/tests/test_shape.py +++ b/apis/python/tests/test_shape.py @@ -205,7 +205,12 @@ def test_dense_nd_array_basics(tmp_path): with tiledbsoma.DenseNDArray.open(uri) as dnda: assert dnda.shape == (100, 200) - assert dnda.maxshape == (100, 200) + if tiledbsoma._flags.DENSE_ARRAYS_CAN_HAVE_CURRENT_DOMAIN: + assert len(dnda.maxshape) + assert dnda.maxshape[0] > 2**62 + assert dnda.maxshape[1] > 2**62 + else: + assert dnda.maxshape == (100, 200) assert dnda.non_empty_domain() == ((0, 0), (0, 0)) diff --git a/apis/r/R/SOMADenseNDArray.R b/apis/r/R/SOMADenseNDArray.R index 7597bb9660..90c41928d3 100644 --- a/apis/r/R/SOMADenseNDArray.R +++ b/apis/r/R/SOMADenseNDArray.R @@ -35,7 +35,7 @@ SOMADenseNDArray <- R6::R6Class( read_arrow_table = function( coords = NULL, result_order = "auto", - log_level = "warn" + log_level = "auto" ) { private$check_open_for_read()