-
Notifications
You must be signed in to change notification settings - Fork 26
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize queries for allow_duplicates=False #208
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,6 +64,20 @@ def __init__( | |
:param uri: URI of the TileDB group | ||
""" | ||
|
||
# People can (and should) call by name. However, it's easy to forget. For example, | ||
# if someone does 'tiledbsc.SOMA("myuri", ctx)' instead of 'tiledbsc.SOMA("myury", ctx)', | ||
# behavior will not be what they expect, and we should let them know sooner than later. | ||
if name is not None: | ||
assert isinstance(name, str) | ||
if soma_options is not None: | ||
assert isinstance(soma_options, SOMAOptions) | ||
if config is not None: | ||
assert isinstance(config, tiledb.Config) | ||
if ctx is not None: | ||
assert isinstance(ctx, tiledb.Ctx) | ||
if parent is not None: | ||
assert isinstance(parent, TileDBGroup) | ||
|
||
if ctx is None and config is not None: | ||
ctx = tiledb.Ctx(config) | ||
if soma_options is None: | ||
|
@@ -315,9 +329,18 @@ def query( | |
# obs, but no rows with cell_type == "blood". | ||
if slice_obs_df is None: | ||
return None | ||
obs_ids = list(slice_obs_df.index) | ||
if len(obs_ids) == 0: | ||
if len(slice_obs_df.index) == 0: | ||
return None | ||
# At the tiledb multi-index level, if we're say slicing on obs_ids but not var_ids, | ||
# we'll do `A.df[obs_ids, :]`. We can't pass a `:` down the callstack to get there, | ||
# but we pass `None` instead. | ||
# | ||
# It's important to do this. Say for example the X matrix is nobs=1000 by nvar=2000, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We were doing the following:
which is not new. However, it performed OK with |
||
# and we have a query that has 158 obs_ids. At the tiledb multi-index level, doing | ||
# `A.df[{158 obs ids}, {all 2000 var ids}]` is non-performant while | ||
# `A.df[{158 obs ids}, :]` is performant. | ||
if obs_ids is not None or obs_query_string is not None: | ||
obs_ids = list(slice_obs_df.index) | ||
|
||
slice_var_df = self.var.query( | ||
query_string=var_query_string, ids=var_ids, attrs=var_attrs | ||
|
@@ -326,9 +349,11 @@ def query( | |
# in its var, but no rows with feature_name == "MT-CO3". | ||
if slice_var_df is None: | ||
return None | ||
var_ids = list(slice_var_df.index) | ||
if len(var_ids) == 0: | ||
if len(slice_var_df.index) == 0: | ||
return None | ||
# See above comment re keeping obs_ids == None if that's what it came in as. | ||
if var_ids is not None or var_query_string is not None: | ||
var_ids = list(slice_var_df.index) | ||
|
||
# TODO: | ||
# do this here: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,20 @@ def __init__( | |
:param uri: URI of the TileDB group | ||
""" | ||
|
||
# People can (and should) call by name. However, it's easy to forget. For example, | ||
# if someone does 'tiledbsc.SOMACollection("myuri", ctx)' instead of 'tiledbsc.SOMA("myury", ctx)', | ||
# behavior will not be what they expect, and we should let them know sooner than later. | ||
if name is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Arg-checking as above, for |
||
assert isinstance(name, str) | ||
if soma_options is not None: | ||
assert isinstance(soma_options, SOMAOptions) | ||
if config is not None: | ||
assert isinstance(config, tiledb.Config) | ||
if ctx is not None: | ||
assert isinstance(ctx, tiledb.Ctx) | ||
if parent is not None: | ||
assert isinstance(parent, TileDBGroup) | ||
|
||
if ctx is None and config is not None: | ||
ctx = tiledb.Ctx(config) | ||
if soma_options is None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,37 +57,6 @@ def __init__( | |
# self.raw_var = raw_var | ||
assert "data" in X | ||
|
||
# Find the dtype. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code is vestigial and was adding unnecessary time to SOMA slice queries. |
||
X_data = self.X["data"] | ||
if isinstance(X_data, pd.DataFrame): | ||
X_dtype = X_data.dtypes["value"] | ||
else: | ||
X_dtype = X_data.dtype | ||
|
||
ann = ad.AnnData(obs=self.obs, var=self.var, dtype=X_dtype) | ||
|
||
for name, data in self.X.items(): | ||
# X comes in from TileDB queries as a 3-column dataframe with "obs_id", "var_id", and | ||
# "value". For AnnData we need to make it a sparse matrix. | ||
if isinstance(data, pd.DataFrame): | ||
# Make obs_id and var_id accessible as columns. | ||
data = data.reset_index() | ||
data = util.X_and_ids_to_sparse_matrix( | ||
data, | ||
"obs_id", # row_dim_name | ||
"var_id", # col_dim_name | ||
"value", # attr_name | ||
self.obs.index, | ||
self.var.index, | ||
) | ||
# We use AnnData as our in-memory storage. For SOMAs, all X layers are arrays within the | ||
# soma.X group; for AnnData, the 'data' layer is ann.X and all the others are in | ||
# ann.layers. | ||
if name == "data": | ||
ann.X = data | ||
else: | ||
ann.layers[name] = data | ||
|
||
# ---------------------------------------------------------------- | ||
def __repr__(self) -> str: | ||
""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Found during debug.
soma = tiledbsc.SOMA(uri, ctx)
is incorrect whilesoma = tiledbsc.SOMA(uri, ctx=ctx)
is correct. This needs to be surfaced proactively to the user.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You may want to enforce keyword-only arguments to avoid such incorrect usage:
def __init__(self, uri, *, name=None, ...)
Manual runtime type checking is considered unidiomatic in general and doesn't scale well with the complexity of annotations (think of
Dict[str, List[Optional[int]]
). For the cases that really need it, there are 3rd party libraries such as Typeguard or Pydantic that use and enforce annotations at runtime transparently.