diff --git a/apis/python/src/tiledbsoma/io/ingest.py b/apis/python/src/tiledbsoma/io/ingest.py index 230e3424e8..560c9d7e6c 100644 --- a/apis/python/src/tiledbsoma/io/ingest.py +++ b/apis/python/src/tiledbsoma/io/ingest.py @@ -2190,16 +2190,42 @@ def _ingest_uns_string_array( However, ``SOMADataFrame`` _requires_ that soma_joinid be present, either as an index column, or as a data column. The former is less confusing. """ - if len(value.shape) != 1: + + if len(value.shape) == 1: + helper = _ingest_uns_1d_string_array + elif len(value.shape) == 2: + helper = _ingest_uns_2d_string_array + else: msg = ( f"Skipped {coll.uri}[{key!r}]" - f" (uns object): string-array is not one-dimensional" + f" (uns object): string array is neither one-dimensional nor two-dimensional" ) logging.log_io(msg, msg) return + helper( + coll=coll, + key=key, + value=value, + platform_config=platform_config, + context=context, + use_relative_uri=use_relative_uri, + ingestion_params=ingestion_params, + ) + + +def _ingest_uns_1d_string_array( + coll: AnyTileDBCollection, + key: str, + value: NPNDArray, + platform_config: Optional[PlatformConfig], + context: Optional[SOMATileDBContext], + *, + use_relative_uri: Optional[bool], + ingestion_params: IngestionParams, +) -> None: + """Helper for ``_ingest_uns_string_array``""" n = len(value) - df_uri = _util.uri_joinpath(coll.uri, key) df = pd.DataFrame( data={ "soma_joinid": np.arange(n, dtype=np.int64), @@ -2208,6 +2234,41 @@ def _ingest_uns_string_array( ) df.set_index("soma_joinid", inplace=True) + df_uri = _util.uri_joinpath(coll.uri, key) + with _write_dataframe_impl( + df, + df_uri, + None, + ingestion_params=ingestion_params, + platform_config=platform_config, + context=context, + ) as soma_df: + _maybe_set(coll, key, soma_df, use_relative_uri=use_relative_uri) + + +def _ingest_uns_2d_string_array( + coll: AnyTileDBCollection, + key: str, + value: NPNDArray, + platform_config: Optional[PlatformConfig], + context: Optional[SOMATileDBContext], + *, + use_relative_uri: Optional[bool], + ingestion_params: IngestionParams, +) -> None: + """Helper for ``_ingest_uns_string_array``. Even if the 2D array is 1xN or Nx1, we + must nonetheless keep this as 2D rather than flattening to length-N 1D. That's because + this ``uns`` data is solely of interest for AnnData ingest/outgest, and it must go + back out the way it came in.""" + num_rows, num_cols = value.shape + data: Dict[str, Any] = {"soma_joinid": np.arange(num_rows, dtype=np.int64)} + for j in range(num_cols): + column_name = f"values_{j}" + data[column_name] = [str(e) if e else "" for e in value[:, j]] + df = pd.DataFrame(data=data) + df.set_index("soma_joinid", inplace=True) + + df_uri = _util.uri_joinpath(coll.uri, key) with _write_dataframe_impl( df, df_uri, diff --git a/apis/python/testdata/pbmc3k.h5ad b/apis/python/testdata/pbmc3k.h5ad index e529a6d851..5484064982 100644 Binary files a/apis/python/testdata/pbmc3k.h5ad and b/apis/python/testdata/pbmc3k.h5ad differ diff --git a/apis/python/tests/test_basic_anndata_io.py b/apis/python/tests/test_basic_anndata_io.py index 4c1bd0b784..722d9136d3 100644 --- a/apis/python/tests/test_basic_anndata_io.py +++ b/apis/python/tests/test_basic_anndata_io.py @@ -40,8 +40,9 @@ def h5ad_file_with_obsm_holes(request): @pytest.fixture -def h5ad_file_uns_string_array(request): - # This has uns["louvain_colors"] with dtype.char == "U" +def h5ad_file_uns_string_arrays(request): + # This has uns["louvain_colors"] with dtype.char == "U". + # It also has uns["more_colors"] in the form '[[...]]', as often occurs in the wild. input_path = HERE.parent / "testdata/pbmc3k.h5ad" return input_path @@ -392,13 +393,13 @@ def test_ingest_uns(tmp_path: pathlib.Path, h5ad_file_extended): assert np.array_equal(got_pca_variance, original.uns["pca"]["variance"]) -def test_ingest_uns_string_array(h5ad_file_uns_string_array): +def test_ingest_uns_string_arrays(h5ad_file_uns_string_arrays): tempdir = tempfile.TemporaryDirectory() output_path = tempdir.name tiledbsoma.io.from_h5ad( output_path, - h5ad_file_uns_string_array.as_posix(), + h5ad_file_uns_string_arrays.as_posix(), measurement_name="RNA", ) @@ -406,9 +407,16 @@ def test_ingest_uns_string_array(h5ad_file_uns_string_array): with tiledbsoma.DataFrame.open( exp.ms["RNA"]["uns"]["louvain_colors"].uri ) as df: - contents = df.read().concat()["values"] - assert len(contents) == 8 - assert contents[0].as_py() == "#1f77b4" + contents = df.read().concat() + assert contents.shape == (8, 2) + assert len(contents["values"]) == 8 + assert contents["values"][0].as_py() == "#1f77b4" + + with tiledbsoma.DataFrame.open(exp.ms["RNA"]["uns"]["more_colors"].uri) as df: + contents = df.read().concat() + assert contents.shape == (8, 2) + assert len(contents["values_0"]) == 8 + assert contents["values_0"][0].as_py() == "#1f77b4" def test_add_matrix_to_collection(adata):