Skip to content

Commit

Permalink
[2/2] [r] Add support for estimated_cell_count in project.json (#3299)
Browse files Browse the repository at this point in the history
  • Loading branch information
dsotirho-ucsc authored and jessebrennan committed Oct 8, 2021
1 parent 00a5910 commit b91dbe0
Show file tree
Hide file tree
Showing 15 changed files with 936 additions and 112 deletions.
13 changes: 10 additions & 3 deletions lambdas/service/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1238,10 +1238,17 @@ def get_project_data(project_id: Optional[str] = None) -> JSON:
schema.object(
additional_properties=True,
organTypes=schema.array(str),
totalFileSize=int,
totalFileSize=float,
fileTypeSummaries=array_of_object_spec,
totalCellCount=int,
cellCountSummaries=array_of_object_spec
totalCellCount=float,
cellCountSummaries=array_of_object_spec,
projectEstimatedCellCount=float,
donorCount=int,
fileCount=int,
labCount=int,
projectCount=int,
speciesCount=int,
specimenCount=int,
)
)
}
Expand Down
789 changes: 716 additions & 73 deletions lambdas/service/openapi.json

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ def service_config(self) -> ServiceConfig:
"insdcProjectAccessions": "contents.projects.insdc_project_accessions",
"insdcStudyAccessions": "contents.projects.insdc_study_accessions",
"accessions": "contents.projects.accessions",
"projectEstimatedCellCount": "contents.projects.estimated_cell_count",

"biologicalSex": "contents.donors.biological_sex",
"sampleId": "contents.samples.biomaterial_id",
Expand Down Expand Up @@ -248,7 +249,8 @@ def service_config(self) -> ServiceConfig:
"project.contributors.institution": "institutions",
"project.contributors.laboratory": "laboratory",
"project.project_core.project_short_name": "project_short_name",
"project.project_core.project_title": "project_title"
"project.project_core.project_title": "project_title",
"project.estimated_cell_count": "estimated_cell_count"
},
"contents.specimens": {
"specimen_from_organism.provenance.document_id": "document_id",
Expand Down
3 changes: 3 additions & 0 deletions src/azul/plugins/metadata/hca/aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
FrequencySetAccumulator,
GroupingAggregator,
ListAccumulator,
MaxAccumulator,
SetAccumulator,
SetOfDictAccumulator,
SimpleAggregator,
Expand Down Expand Up @@ -164,6 +165,8 @@ def _get_accumulator(self, field) -> Optional[Accumulator]:
'publications',
'accessions'):
return None
elif field == 'estimated_cell_count':
return MaxAccumulator()
else:
return super()._get_accumulator(field)

Expand Down
6 changes: 4 additions & 2 deletions src/azul/plugins/metadata/hca/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,7 +602,8 @@ def _project_types(cls) -> FieldTypes:
'insdc_study_accessions': [null_str],
'supplementary_links': [null_str],
'_type': null_str,
'accessions': cls._accession_types()
'accessions': cls._accession_types(),
'estimated_cell_count': null_int
}

def _project(self, project: api.Project) -> MutableJSON:
Expand Down Expand Up @@ -647,7 +648,8 @@ def _project(self, project: api.Project) -> MutableJSON:
'supplementary_links': sorted(project.supplementary_links),
'_type': 'project',
'accessions': sorted(map(self._accession, project.accessions),
key=itemgetter('namespace', 'accession'))
key=itemgetter('namespace', 'accession')),
'estimated_cell_count': project.estimated_cell_count
}

@classmethod
Expand Down
1 change: 1 addition & 0 deletions src/azul/service/avro_pfb.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ def _entity_schema_recursive(field_types: FieldTypes,
# Exceptions are fields that do not become lists during aggregation
exceptions = (
'donor_count',
'estimated_cell_count',
'submission_date',
'total_estimated_cells',
'update_date',
Expand Down
7 changes: 6 additions & 1 deletion src/azul/service/elasticsearch_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,8 +534,13 @@ def transform_summary(self,
'terms',
field='contents.samples.effective_organ.keyword',
size=config.terms_aggregation_size)
elif entity_type == 'projects':
# Add a project cell count aggregate
es_search.aggs.metric('projectEstimatedCellCount',
'sum',
field='contents.projects.estimated_cell_count_')
else:
assert entity_type == 'projects', entity_type
assert False, entity_type

cardinality_aggregations = {
'samples': {
Expand Down
22 changes: 13 additions & 9 deletions src/azul/service/hca_response_v5.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
to_camel_case,
)
from azul.types import (
AnyJSON,
JSON,
)

Expand Down Expand Up @@ -83,17 +84,17 @@ class FileTypeSummary(JsonObject):
# https://github.com/DataBiosphere/azul/issues/3180
source = ListProperty() # List could have string(s) and/or None
count = IntegerProperty()
totalSize = IntegerProperty()
matrixCellCount = IntegerProperty()
totalSize = FloatProperty()
matrixCellCount = FloatProperty()
isIntermediate = BooleanProperty()
contentDescription = ListProperty() # List could have string(s) and/or None

@classmethod
def for_bucket(cls, bucket: JSON) -> 'FileTypeSummary':
self = cls()
self.count = bucket['doc_count']
self.totalSize = int(bucket['size_by_type']['value']) # Casting to integer since ES returns a double
self.matrixCellCount = int(bucket['matrix_cell_count_by_type']['value'])
self.totalSize = bucket['size_by_type']['value']
self.matrixCellCount = bucket['matrix_cell_count_by_type']['value']
self.format = bucket['key']
# FIXME: Remove deprecated field 'fileType'
# https://github.com/DataBiosphere/azul/issues/3180
Expand Down Expand Up @@ -162,6 +163,7 @@ class SummaryRepresentation(JsonObject):
donorCount = IntegerProperty()
labCount = IntegerProperty()
totalCellCount = FloatProperty()
projectEstimatedCellCount = FloatProperty()
organTypes = ListProperty(StringProperty(required=False))
fileTypeSummaries = ListProperty(FileTypeSummary)
cellCountSummaries = ListProperty(OrganCellCountSummary)
Expand Down Expand Up @@ -237,7 +239,7 @@ def __init__(self, aggregations):
self.aggregations = aggregations

def return_response(self):
def agg_value(*path: str) -> JSON:
def agg_value(*path: str) -> AnyJSON:
agg = self.aggregations
for name in path:
agg = agg[name]
Expand All @@ -257,6 +259,7 @@ def agg_values(function: Callable[[JSON], T], *path: str) -> List[T]:
donorCount=agg_value('donorCount', 'value'),
labCount=agg_value('labCount', 'value'),
totalCellCount=agg_value('totalCellCount', 'value'),
projectEstimatedCellCount=agg_value('projectEstimatedCellCount', 'value'),
organTypes=agg_values(OrganType.for_bucket,
'organTypes', 'buckets'),
fileTypeSummaries=agg_values(FileTypeSummary.for_bucket,
Expand Down Expand Up @@ -332,10 +335,11 @@ def make_projects(self, entry):
for project in contents["projects"]:
translated_project = {
**self._make_entity(project),
"projectId": project['document_id'],
"projectTitle": project.get("project_title"),
"projectShortname": project["project_short_name"],
"laboratory": sorted(set(project.get("laboratory", [None])))
'projectId': project['document_id'],
'projectTitle': project.get('project_title'),
'projectShortname': project['project_short_name'],
'laboratory': sorted(set(project.get('laboratory', [None]))),
'estimatedCellCount': project['estimated_cell_count'],
}
if self.entity_type in ('projects', 'bundles'):
entity = one(entry['contents']['aggregate_dates'])
Expand Down
3 changes: 2 additions & 1 deletion src/azul/service/repository_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ def get_summary(self, catalog: CatalogName, filters):
],
'projects': [
'project',
'labCount'
'labCount',
'projectEstimatedCellCount'
],
'cell_suspensions': [
'totalCellCount',
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 47 additions & 0 deletions test/indexer/test_hca_indexer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from bisect import (
insort,
)
from collections import (
Counter,
defaultdict,
Expand Down Expand Up @@ -56,6 +59,9 @@
config,
hmac,
)
from azul.collections import (
NestedDict,
)
from azul.deployment import (
aws,
)
Expand Down Expand Up @@ -1268,6 +1274,47 @@ def test_accessions_fields(self):
]
self.assertEqual(expected_accessions, project['accessions'])

def test_cell_counts(self):
    """
    Index two canned bundles and check the cell count values that end up in
    the `projects`, `cell_suspensions` and `files` inner entities of every
    resulting document, for contributions as well as aggregates.
    """
    # Both bundles come from the canned staging area and belong to
    # project 90bf705c
    # https://github.com/HumanCellAtlas/schema-test-data/
    for uuid in ('4da04038-adab-59a9-b6c4-3a61242cc972',
                 'd7b8cbff-aee9-5a05-a4a1-d8f4e720aee7'):
        fqid = self.bundle_fqid(uuid=uuid,
                                version='2021-01-01T00:00:00.000000Z')
        self._index_canned_bundle(fqid)
    hits = self._get_all_hits()

    # (inner entity type, name of the field holding the cell count)
    field_paths = [
        ('projects', 'estimated_cell_count'),
        ('cell_suspensions', 'total_estimated_cells'),
        ('files', 'matrix_cell_count')
    ]

    # Collect the observed values, keyed by aggregate flag, outer entity
    # type and inner entity type. Hits are visited in `_id` order and each
    # value is inserted with `insort`, so every list stays sorted.
    observed = NestedDict(2, list)
    for hit in sorted(hits, key=lambda h: h['_id']):
        entity_type, aggregate = self._parse_index_name(hit)
        contents = hit['_source']['contents']
        for inner_type, field in field_paths:
            for inner in contents[inner_type]:
                # The nested entry is looked up only when there is a value
                # to insert, so no empty lists are created as a side effect
                insort(observed[aggregate][entity_type][inner_type],
                       inner[field])

    expected = NestedDict(1, dict)
    for aggregate in False, True:
        for entity_type in self.index_service.entity_types(self.catalog):
            if aggregate and entity_type == 'projects':
                # estimated_cell_count is aggregated using max, not sum,
                # whereas the expected cell_suspensions and files values
                # equal the sum of the two per-bundle values
                counts = {
                    'projects': [10000],
                    'cell_suspensions': [40000],
                    'files': [17100]
                }
            else:
                counts = {
                    'projects': [10000, 10000],
                    'cell_suspensions': [20000, 20000],
                    'files': [2100, 15000]
                }
            expected[aggregate][entity_type] = counts

    self.assertEqual(expected.to_dict(), observed.to_dict())

def test_no_cell_count_contributions(self):
def assert_cell_suspension(expected: JSON, hits: List[JSON]):
project_hit = one(hit
Expand Down
Loading

0 comments on commit b91dbe0

Please sign in to comment.