DataBiosphere · jessebrennan · Oct 8, 2021 · Aug 20, 2021 · Aug 23, 2021 · Sep 9, 2021
@@ -1238,10 +1238,17 @@ def get_project_data(project_id: Optional[str] = None) -> JSON:
                 schema.object(
                     additional_properties=True,
                     organTypes=schema.array(str),
-                    totalFileSize=int,
+                    totalFileSize=float,
                     fileTypeSummaries=array_of_object_spec,
-                    totalCellCount=int,
-                    cellCountSummaries=array_of_object_spec
+                    totalCellCount=float,
+                    cellCountSummaries=array_of_object_spec,
+                    projectEstimatedCellCount=float,
+                    donorCount=int,
+                    fileCount=int,
+                    labCount=int,
+                    projectCount=int,
+                    speciesCount=int,
+                    specimenCount=int,
                 )
             )
         }

@@ -156,6 +156,7 @@ def service_config(self) -> ServiceConfig:
                 "insdcProjectAccessions": "contents.projects.insdc_project_accessions",
                 "insdcStudyAccessions": "contents.projects.insdc_study_accessions",
                 "accessions": "contents.projects.accessions",
+                "projectEstimatedCellCount": "contents.projects.estimated_cell_count",
 
                 "biologicalSex": "contents.donors.biological_sex",
                 "sampleId": "contents.samples.biomaterial_id",
@@ -248,7 +249,8 @@ def service_config(self) -> ServiceConfig:
                     "project.contributors.institution": "institutions",
                     "project.contributors.laboratory": "laboratory",
                     "project.project_core.project_short_name": "project_short_name",
-                    "project.project_core.project_title": "project_title"
+                    "project.project_core.project_title": "project_title",
+                    "project.estimated_cell_count": "estimated_cell_count"
                 },
                 "contents.specimens": {
                     "specimen_from_organism.provenance.document_id": "document_id",

@@ -24,6 +24,7 @@
     FrequencySetAccumulator,
     GroupingAggregator,
     ListAccumulator,
+    MaxAccumulator,
     SetAccumulator,
     SetOfDictAccumulator,
     SimpleAggregator,
@@ -164,6 +165,8 @@ def _get_accumulator(self, field) -> Optional[Accumulator]:
                        'publications',
                        'accessions'):
             return None
+        elif field == 'estimated_cell_count':
+            return MaxAccumulator()
         else:
             return super()._get_accumulator(field)
 

@@ -602,7 +602,8 @@ def _project_types(cls) -> FieldTypes:
             'insdc_study_accessions': [null_str],
             'supplementary_links': [null_str],
             '_type': null_str,
-            'accessions': cls._accession_types()
+            'accessions': cls._accession_types(),
+            'estimated_cell_count': null_int
         }
 
     def _project(self, project: api.Project) -> MutableJSON:
@@ -647,7 +648,8 @@ def _project(self, project: api.Project) -> MutableJSON:
             'supplementary_links': sorted(project.supplementary_links),
             '_type': 'project',
             'accessions': sorted(map(self._accession, project.accessions),
-                                 key=itemgetter('namespace', 'accession'))
+                                 key=itemgetter('namespace', 'accession')),
+            'estimated_cell_count': project.estimated_cell_count
         }
 
     @classmethod

@@ -4,6 +4,7 @@
 import logging
 from operator import (
     attrgetter,
+    itemgetter,
 )
 from typing import (
     ClassVar,
@@ -93,18 +94,20 @@ def add_doc(self, doc: JSON):
         file_relations = set()
         for entity_type, entities in contents.items():
             if entity_type != 'files':
-                for entity in entities:
-                    if 'document_id' in entity:
-                        entity = PFBEntity.from_json(name=entity_type,
-                                                     object_=entity,
-                                                     schema=self.schema)
-                        if entity not in self._entities:
-                            self._entities[entity] = set()
-                        file_relations.add(PFBRelation.to_entity(entity))
-                    else:
-                        # FIXME: Protocol entities lack document ID so we skip for now
-                        #        https://github.com/DataBiosphere/azul/issues/3084
-                        pass
+                # FIXME: Protocol entities lack document ID so we skip for now
+                #        https://github.com/DataBiosphere/azul/issues/3084
+                entities = (e for e in entities if 'document_id' in e)
+                # Sorting entities is required for deterministic output since
+                # the order of the inner entities in an aggregate document is
+                # tied to the order in which contributions are returned by ES
+                # during aggregation, which happens to be non-deterministic.
+                for entity in sorted(entities, key=itemgetter('document_id')):
+                    entity = PFBEntity.from_json(name=entity_type,
+                                                 object_=entity,
+                                                 schema=self.schema)
+                    if entity not in self._entities:
+                        self._entities[entity] = set()
+                    file_relations.add(PFBRelation.to_entity(entity))
         # File entities are assumed to be unique
         file_entity = PFBEntity.from_json(name='files',
                                           object_=one(contents['files']),
@@ -438,6 +441,7 @@ def _entity_schema_recursive(field_types: FieldTypes,
             # Exceptions are fields that do not become lists during aggregation
             exceptions = (
                 'donor_count',
+                'estimated_cell_count',
                 'submission_date',
                 'total_estimated_cells',
                 'update_date',

@@ -534,8 +534,13 @@ def transform_summary(self,
                                   'terms',
                                   field='contents.samples.effective_organ.keyword',
                                   size=config.terms_aggregation_size)
+        elif entity_type == 'projects':
+            # Add a project cell count aggregate
+            es_search.aggs.metric('projectEstimatedCellCount',
+                                  'sum',
+                                  field='contents.projects.estimated_cell_count_')
         else:
-            assert entity_type == 'projects', entity_type
+            assert False, entity_type
 
         cardinality_aggregations = {
             'samples': {

@@ -34,6 +34,7 @@
     to_camel_case,
 )
 from azul.types import (
+    AnyJSON,
     JSON,
 )
 
@@ -83,17 +84,17 @@ class FileTypeSummary(JsonObject):
     #        https://github.com/DataBiosphere/azul/issues/3180
     source = ListProperty()  # List could have string(s) and/or None
     count = IntegerProperty()
-    totalSize = IntegerProperty()
-    matrixCellCount = IntegerProperty()
+    totalSize = FloatProperty()
+    matrixCellCount = FloatProperty()
     isIntermediate = BooleanProperty()
     contentDescription = ListProperty()  # List could have string(s) and/or None
 
     @classmethod
     def for_bucket(cls, bucket: JSON) -> 'FileTypeSummary':
         self = cls()
         self.count = bucket['doc_count']
-        self.totalSize = int(bucket['size_by_type']['value'])  # Casting to integer since ES returns a double
-        self.matrixCellCount = int(bucket['matrix_cell_count_by_type']['value'])
+        self.totalSize = bucket['size_by_type']['value']
+        self.matrixCellCount = bucket['matrix_cell_count_by_type']['value']
         self.format = bucket['key']
         # FIXME: Remove deprecated field 'fileType'
         #        https://github.com/DataBiosphere/azul/issues/3180
@@ -162,6 +163,7 @@ class SummaryRepresentation(JsonObject):
     donorCount = IntegerProperty()
     labCount = IntegerProperty()
     totalCellCount = FloatProperty()
+    projectEstimatedCellCount = FloatProperty()
     organTypes = ListProperty(StringProperty(required=False))
     fileTypeSummaries = ListProperty(FileTypeSummary)
     cellCountSummaries = ListProperty(OrganCellCountSummary)
@@ -237,7 +239,7 @@ def __init__(self, aggregations):
         self.aggregations = aggregations
 
     def return_response(self):
-        def agg_value(*path: str) -> JSON:
+        def agg_value(*path: str) -> AnyJSON:
             agg = self.aggregations
             for name in path:
                 agg = agg[name]
@@ -257,6 +259,7 @@ def agg_values(function: Callable[[JSON], T], *path: str) -> List[T]:
             donorCount=agg_value('donorCount', 'value'),
             labCount=agg_value('labCount', 'value'),
             totalCellCount=agg_value('totalCellCount', 'value'),
+            projectEstimatedCellCount=agg_value('projectEstimatedCellCount', 'value'),
             organTypes=agg_values(OrganType.for_bucket,
                                   'organTypes', 'buckets'),
             fileTypeSummaries=agg_values(FileTypeSummary.for_bucket,
@@ -332,10 +335,11 @@ def make_projects(self, entry):
         for project in contents["projects"]:
             translated_project = {
                 **self._make_entity(project),
-                "projectId": project['document_id'],
-                "projectTitle": project.get("project_title"),
-                "projectShortname": project["project_short_name"],
-                "laboratory": sorted(set(project.get("laboratory", [None])))
+                'projectId': project['document_id'],
+                'projectTitle': project.get('project_title'),
+                'projectShortname': project['project_short_name'],
+                'laboratory': sorted(set(project.get('laboratory', [None]))),
+                'estimatedCellCount': project['estimated_cell_count'],
             }
             if self.entity_type in ('projects', 'bundles'):
                 entity = one(entry['contents']['aggregate_dates'])

@@ -157,7 +157,8 @@ def get_summary(self, catalog: CatalogName, filters):
             ],
             'projects': [
                 'project',
-                'labCount'
+                'labCount',
+                'projectEstimatedCellCount'
             ],
             'cell_suspensions': [
                 'totalCellCount',