[Bug] ArrowInvalid: cannot construct ChunkedArray from empty vector and omitted type #3633

Closed
nick-youngblut opened this issue Jan 27, 2025 · 19 comments

@nick-youngblut

Describe the bug

The error when running tiledbsoma.io.from_anndata:

---------------------------------------------------------------------------
ArrowInvalid                              Traceback (most recent call last)
Cell In[28], line 2
      1 # ingest new data
----> 2 tiledbsoma.io.from_anndata(
      3     db_uri,
      4     adata,
      5     measurement_name="RNA",
      6     registration_mapping=rd,
      7 )

File ~/miniforge3/envs/tiledb/lib/python3.12/site-packages/tiledbsoma/io/ingest.py:567, in from_anndata(experiment_uri, anndata, measurement_name, context, platform_config, obs_id_name, var_id_name, X_layer_name, raw_X_layer_name, ingest_mode, use_relative_uri, X_kind, registration_mapping, uns_keys, additional_metadata)
    557 _maybe_ingest_uns(
    558     measurement,
    559     anndata.uns,
   (...)
    562     **ingest_platform_ctx,
    563 )
    565 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    566 # MS/meas/VAR
--> 567 with _write_dataframe(
    568     _util.uri_joinpath(measurement_uri, "var"),
    569     conversions.obs_or_var_to_tiledb_supported_array_type(anndata.var),
    570     id_column_name=var_id_name,
    571     # Layer existence is pre-checked in the registration phase
    572     axis_mapping=jidmaps.var_axes[measurement_name],
    573     **ingest_platform_ctx,
    574 ) as var:
    575     _maybe_set(measurement, "var", var, use_relative_uri=use_relative_uri)
    577 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    578 # MS/meas/X/DATA

File ~/miniforge3/envs/tiledb/lib/python3.12/site-packages/tiledbsoma/io/ingest.py:1287, in _write_dataframe(df_uri, df, id_column_name, ingestion_params, additional_metadata, platform_config, context, axis_mapping)
   1284 df[SOMA_JOINID] = np.asarray(axis_mapping.data, dtype=np.int64)
   1285 df.set_index(SOMA_JOINID, inplace=True)
-> 1287 return _write_dataframe_impl(
   1288     df,
   1289     df_uri,
   1290     id_column_name,
   1291     shape=axis_mapping.get_shape(),
   1292     ingestion_params=ingestion_params,
   1293     additional_metadata=additional_metadata,
   1294     original_index_metadata=original_index_metadata,
   1295     platform_config=platform_config,
   1296     context=context,
   1297 )

File ~/miniforge3/envs/tiledb/lib/python3.12/site-packages/tiledbsoma/io/ingest.py:1329, in _write_dataframe_impl(df, df_uri, id_column_name, shape, ingestion_params, additional_metadata, original_index_metadata, platform_config, context)
   1325     if id_column_name is None:
   1326         # Nominally, nil id_column_name only happens for uns append and we do not append uns,
   1327         # which is a concern for our caller. This is a second-level check.
   1328         raise ValueError("internal coding error: id_column_name unspecified")
-> 1329     arrow_table = _extract_new_values_for_append(df_uri, arrow_table, context)
   1331 try:
   1332     # Note: tiledbsoma.io creates dataframes with soma_joinid being the one
   1333     # and only index column.
   1334     domain = ((0, shape - 1),)

File ~/miniforge3/envs/tiledb/lib/python3.12/site-packages/tiledbsoma/io/ingest.py:1222, in _extract_new_values_for_append(df_uri, arrow_table, context)
   1218 try:
   1219     with _factory.open(
   1220         df_uri, "r", soma_type=DataFrame, context=context
   1221     ) as previous_soma_dataframe:
-> 1222         return _extract_new_values_for_append_aux(
   1223             previous_soma_dataframe, arrow_table
   1224         )
   1226 except DoesNotExistError:
   1227     return arrow_table

File ~/miniforge3/envs/tiledb/lib/python3.12/site-packages/tiledbsoma/io/ingest.py:1182, in _extract_new_values_for_append_aux(previous_soma_dataframe, arrow_table)
   1173         column = pa.chunked_array(
   1174             [chunk.dictionary_decode() for chunk in column.chunks]
   1175         )
   1177     elif is_cat(old_field) and not is_cat(new_field):
   1178         # Convert from non-categorical to categorical.  Note:
   1179         # libtiledbsoma already merges the enum mappings, e.g if the
   1180         # storage has red, yellow, & green, but our new data has some
   1181         # yellow, green, and orange.
-> 1182         column = pa.chunked_array(
   1183             [chunk.dictionary_encode() for chunk in column.chunks]
   1184         )
   1186     fields_dict[name] = column
   1187 arrow_table = pa.Table.from_pydict(fields_dict)

File ~/miniforge3/envs/tiledb/lib/python3.12/site-packages/pyarrow/table.pxi:1537, in pyarrow.lib.chunked_array()

File ~/miniforge3/envs/tiledb/lib/python3.12/site-packages/pyarrow/error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()

File ~/miniforge3/envs/tiledb/lib/python3.12/site-packages/pyarrow/error.pxi:92, in pyarrow.lib.check_status()

ArrowInvalid: cannot construct ChunkedArray from empty vector and omitted type

To Reproduce

import os
import tempfile

import tiledbsoma
import tiledbsoma.io
import scanpy as sc

input_path = "/home/nickyoungblut/dev/tmp/tiledb/2025-01-24_23-55-08/STAR/SRX21101392/Gene/filtered"
srx_accession = "SRX21101392"

# Read 10x mtx data
adata = sc.read_10x_mtx(
    input_path,
    var_names="gene_ids",  
    make_unique=True  
)

# add SRX column
adata.obs["SRX_accession"] = [srx_accession] * len(adata.obs)

# create tiledb soma db (work_dir is assumed to be defined earlier in the session)
db_dir = os.path.join(work_dir, "MY_DATABASE")

## read from temp location and write to tiledb
if os.path.exists(db_dir):
    db_uri = db_dir
else:
    # write adata file to temp location
    temp_dir = tempfile.mkdtemp()
    h5ad_file = os.path.join(temp_dir, "adata.h5ad")
    adata.write_h5ad(h5ad_file)

    ## create db
    db_uri = tiledbsoma.io.from_h5ad(
        db_dir, 
        input_path = h5ad_file,
        measurement_name = "RNA"
    )


# append more data
input_path = "/home/nickyoungblut/dev/tmp/tiledb/2025-01-22_01-10-09/STAR/SRX24099779/Gene/filtered"
srx_accession = "SRX24099779"

# Read 10x mtx data
adata = sc.read_10x_mtx(
    input_path,
    var_names="gene_ids",  
    make_unique=True  
)

# add SRX column
adata.obs["SRX_accession"] = [srx_accession] * len(adata.obs)

# register
rd = tiledbsoma.io.register_anndatas(
    db_uri,
    [adata],
    measurement_name="RNA",
    obs_field_name="obs_id",
    var_field_name="var_id",
)

# apply resize
with tiledbsoma.Experiment.open(db_uri) as exp:
    tiledbsoma.io.resize_experiment(
        exp.uri, 
        nobs=rd.get_obs_shape(), 
        nvars=rd.get_var_shapes()
    )

# ingest new data into the db
tiledbsoma.io.from_anndata(
    db_uri,
    adata,
    measurement_name="RNA",
    registration_mapping=rd,
)

Versions (please complete the following information):

  • TileDB-SOMA version: 1.15.4
  • Language and language version (e.g. Python 3.9, R 4.3.2): 3.12.8
  • OS (e.g. MacOS, Ubuntu Linux): Ubuntu

Additional context

The data is ingested, but the error is still raised, so I have to use:

from pyarrow import ArrowInvalid

try:
    tiledbsoma.io.from_anndata(
        db_uri,
        adata,
        measurement_name="RNA",
        registration_mapping=rd,
    )
except ArrowInvalid:
    pass

I'm wondering if the issue is due to the 2nd dataset's gene set matching perfectly with the 1st dataset's gene set, so there are zero rows to add (empty vector).
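
For what it's worth, the error message itself can be produced at the pyarrow level from a zero-chunk column (a minimal sketch, independent of tiledbsoma):

import pyarrow as pa

# A typed-but-empty column has zero chunks.
empty = pa.chunked_array([], type=pa.string())
assert empty.num_chunks == 0

# Re-encoding its (nonexistent) chunks hands pa.chunked_array an empty
# list with no type, which raises the same error as in the traceback above.
pa.chunked_array([chunk.dictionary_encode() for chunk in empty.chunks])
# ArrowInvalid: cannot construct ChunkedArray from empty vector and omitted type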

@johnkerl johnkerl self-assigned this Jan 27, 2025
@johnkerl

I'm wondering if the issue is due to the 2nd dataset gene set matching perfectly with the 1st gene set, so there are zero rows to add (empty vector).

Hi @nick-youngblut! It's definitely the case that there aren't new rows to add -- given that you're registering with

    obs_field_name="obs_id",
    var_field_name="var_id",

and your

# add SRX column
adata.obs["SRX_accession"] = [srx_accession] * len(adata.obs)

didn't modify obs_id, there are no new rows in obs.

So that's one issue -- if you want to add more obs rows, either mutate the new rows' obs_id, or use a different column name (other than obs_id) to tell us which column is the ID column for obs.
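
For example, a minimal sketch of the first option (hypothetical, reusing adata and srx_accession from the repro above): prefix each cell barcode with the dataset's accession so every appended row gets a fresh obs_id.

# Hypothetical: make obs IDs unique per dataset by prefixing each barcode
# with the SRX accession, so registration sees genuinely new rows.
adata.obs.index = [f"{srx_accession}_{barcode}" for barcode in adata.obs.index]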

The other issue, though: you shouldn't be getting the ChunkedArray error message we're giving you. This feels like a bug on our part. I did try a simple ingest-register-ingest-again with the exact same (non-10X) dataset and didn't get the error you did, so there must be something a bit corner-casey going on here.

I'll investigate.

@johnkerl

A third possible issue: append-mode ingest is for adding more data with the same column schema. Does adata.obs["SRX_accession"] exist in your original (before-the-append) data from 10X?

@nick-youngblut

Thanks @johnkerl for the detailed feedback!

Does adata.obs["SRX_accession"] exist in your original (before-the-append) data from 10X?

It does. Still, I will double check.

I'll also do some investigation on my end. If you need the data, I could provide it, since the data is already published on the SRA (hence, the SRX accessions).

@johnkerl

@nick-youngblut yes, if it's not too much to ask, having access to the data would indeed be super-helpful 🙏

@nick-youngblut

I haven't been able to reproduce this issue. I'm not sure why it happened, but it hasn't happened since. 🤷

@cbrueffer commented Jan 31, 2025

I'm seeing the same issue (after fixing #3641) in a similar write-then-append scenario. All datasets have the same obs column schema and the same gene IDs (padded across datasets, which introduces NaNs in X and in var columns for the padded genes). I haven't been able to come up with minimal datasets yet, but I will post here once I do.

@cbrueffer commented Jan 31, 2025

Here are some updates: using the code linked in the above PR (rewritten to ingest via from_anndata), I'm testing with two datasets, both of which have 40405 genes. Doing write-then-append with the full 40405 genes, I get the ChunkedArray error. If I subset the datasets to 40404 genes, things work. Could it be an off-by-one problem somewhere?

Update: that worked for those two files, but appending others subset in the same way started failing with the ChunkedArray issue again.

@johnkerl commented Feb 3, 2025

@cbrueffer Thanks!

If we at TileDB can get an on-demand repro I think we can solve this pretty quickly ... 🤔

@johnkerl commented Feb 3, 2025

If I subset the datasets to 40404 genes, things work successfully. Could be an off-by-one problem somewhere?

I think it's more likely there's a threshold being crossed (total number of bytes, say) wherein the number of chunks in a particular ChunkedArray object goes from 1 to 2 ... 🤔
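For instance (a sketch of the chunking idea only, not the actual ingest path):

import pyarrow as pa

# The same logical column can arrive as one chunk or several, e.g. after
# concatenating tables; code that iterates per-chunk then behaves differently.
t1 = pa.table({"x": ["a", "b"]})
t2 = pa.table({"x": ["c"]})
print(t1["x"].num_chunks)                          # 1
print(pa.concat_tables([t1, t2])["x"].num_chunks)  # 2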

We and other customers have ingested a huge range of data sizes over the years, and I haven't seen this particular symptom before, so there must be something engagingly/puzzlingly corner-casey going on here ...

@johnkerl commented Feb 3, 2025

@nick-youngblut @cbrueffer can you please share your pyarrow version BTW? (Just one more debug datapoint ...)

@cbrueffer

Thanks John, I'll keep trying to come up with a good reproduction case. My package versions:

tiledb 0.33.3
tiledbsoma 1.15.1
pyarrow 18.1.0

@johnkerl commented Feb 4, 2025

Thanks @cbrueffer ! We'll keep trying for a repro as well

@jp-dark commented Feb 12, 2025

I wasn't able to reproduce the issue, but I think I've identified at least part of it. We convert sufficiently large categorical pandas data to non-categorical, but then we need to convert it back when appending to a var dataframe whose schema has an existing categorical column. This explains why the issue is size-dependent for some datasets.

The Arrow array for the categorical column is also getting dropped in your case; I wasn't able to reproduce that part, but I suspect it is caused by the cast to non-categorical and back again. I'm working on a fix to remove this round-trip, which will hopefully fix the bug.
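
A sketch of that round-trip at the pyarrow level (illustrative only; the actual logic lives in tiledbsoma's ingest code):

import pandas as pd
import pyarrow as pa

# A pandas categorical arrives as an Arrow dictionary-encoded column.
s = pd.Series(pd.Categorical(["red", "yellow", "green"]))
col = pa.chunked_array([pa.Array.from_pandas(s)])

# Decode (categorical -> plain values), then re-encode on append.
decoded = pa.chunked_array([c.dictionary_decode() for c in col.chunks])
reencoded = pa.chunked_array([c.dictionary_encode() for c in decoded.chunks])

# If the column ever reaches this step with zero chunks, the comprehension
# yields an empty list and pa.chunked_array raises the ArrowInvalid above.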

@cbrueffer

That's great news, Julia! I can reproduce the issue, so I should be able to verify whether a fix works. I'm currently working on clearing the datasets in question internally, so I may be able to share them soon.

@johnkerl johnkerl removed their assignment Feb 13, 2025
@cbrueffer commented Feb 14, 2025

Hi @jp-dark, thanks for looking into this issue! Here are links to the two test files (both small, 1 cell x 40405 genes each):

https://insilicoconsultingab-my.sharepoint.com/:u:/g/personal/christian_brueffer_insilico_consulting/EWqTilqnmEJOsON1tcl_DK8BrnA81zMx_jTWE8uvUgOXog?e=EBxWLo
https://insilicoconsultingab-my.sharepoint.com/:u:/g/personal/christian_brueffer_insilico_consulting/EY6my0dmauBKgErZQPCxDJ4BBhShnx8PFjVdKA9TRU25Hg?e=dosLLx

Using tiledb 0.33.3, tiledbsoma 1.15.1, and pyarrow 18.1.0, those files and the following code reproduce the issue for me:

import tiledbsoma.io
import scanpy as sc

tiledb_path = "tiledb_test"

# read the two test files (from_anndata expects AnnData objects, not paths)
adata1 = sc.read_h5ad("tiledb_crash_test1_1x40405.h5ad")
adata2 = sc.read_h5ad("tiledb_crash_test2_1x40405.h5ad")

OBS_ID = "obs_id"
VAR_ID = "var_id"

# create TileDB
tiledbsoma.io.from_anndata(tiledb_path, anndata=adata1, measurement_name="RNA", obs_id_name=OBS_ID, var_id_name=VAR_ID)

# add second dataset to TileDB
rd = tiledbsoma.io.register_anndatas(tiledb_path, [adata2], measurement_name="RNA", obs_field_name=OBS_ID, var_field_name=VAR_ID)
tiledbsoma.io.resize_experiment(tiledb_path, nobs=rd.get_obs_shape(), nvars=rd.get_var_shapes())
tiledbsoma.io.from_anndata(tiledb_path, anndata=adata2, measurement_name="RNA", registration_mapping=rd)

@jp-dark commented Feb 14, 2025

Thanks @cbrueffer! I was able to reproduce the issue with this data.

@jp-dark commented Feb 14, 2025

We merged in a fix that will go out with the next release.

@jp-dark jp-dark closed this as completed Feb 14, 2025
@johnkerl

We merged in a fix that will go out with the next release.

I am hoping for our 1.16.0 sometime next week.

@cbrueffer

Thanks a lot everyone!
