diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0bcab1d12..4ffd2f3fd 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,32 @@ smaht-portal Change Log ---------- +0.124.0 +======= + +* 2024-11-20/dmichaels - branch: dmichaels-20241119-browse-view (PR-295) + +* Added module browse.py for /browse; adapted from fourfront/.../search.py/browse. + This is for ticket: https://hms-dbmi.atlassian.net/browse/C4-1184 + +* New endpoint /recent_files_summary which, by default, returns info for files released + within the past three months grouped by release-date, cell-line or donor, and + file-description. The specific fields used for these groupings are: + - release-date: file_status_tracking.released + - cell-line: file_sets.libraries.analytes.samples.sample_sources.cell_line.code + - donor: donors.display_title + - file-description: release_tracker_description + Note that release_tracker_description is a newer (2024-12) calcprop (PR-298/sn_file_release_tracker); + and included in this branch are these files from the branch sn_file_release_tracker: + - src/encoded/item_utils/file.py + - src/encoded/types/file.py + Added these new modules to support this new endpoint: + - src/encoded/recent_files_summary.py + - src/encoded/elasticsearch_utils.py (maybe move to dcicutils eventually) + - src/encoded/endpoint_utils.py (maybe move to dcicutils eventually) + This is for ticket: https://hms-dbmi.atlassian.net/browse/C4-1192 + + 0.123.0 ======= `PR 310 SN Add liquid category ` @@ -23,6 +49,7 @@ Change Log * Assay and sequencer codes value set to XX for DSA fasta files and chain files * For Supplementary Files, use `haplotype`, `target_assembly`, and `source_assembly` properties to create annotated filenames for chain and fasta files + 0.121.0 ======= `PR 300 SN Remove basecalling ` diff --git a/poetry.lock b/poetry.lock index 60da629ed..cad476ce2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -4015,6 +4015,21 @@ setuptools = "*" [package.extras] testing = ["pytest", "pytest-cov"] +[[package]] +name = "termcolor" +version = "2.5.0" +description = "ANSI color formatting for output in terminal" +category = "main" +optional = false +python-versions = ">=3.9" +files = [ + {file = "termcolor-2.5.0-py3-none-any.whl", hash = "sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8"}, + {file = "termcolor-2.5.0.tar.gz", hash = "sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -4475,4 +4490,4 @@ test = ["zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.9.1,<3.13" -content-hash = "85d3cfc258bd495fab8caf35d943f40fb9e3c7114fcd59f1661d380fe15a0c09" +content-hash = "72b303a0100150cc88c75fceb3b9ab1f2a5123686a6ef75bf8d2e4320cb0a6a9" diff --git a/pyproject.toml b/pyproject.toml index e12edb623..629c06aca 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "encoded" -version = "0.123.0" +version = "0.123.0.1b1" # TODO: To become 0.124.0 description = "SMaHT Data Analysis Portal" authors = ["4DN-DCIC Team "] license = "MIT" @@ -99,6 +99,7 @@ structlog = ">=19.2.0,<20" subprocess-middleware = "^0.3.0" supervisor = "^4.2.4" # Useful for picking apart pyproject.toml +termcolor = "^2.4.0" toml = ">=0.10.1,<1" tqdm = "^4.59.0" transaction = "^3.0.0" diff --git a/src/encoded/__init__.py b/src/encoded/__init__.py index 67a2a4fb6..9fa9f9453 100644 --- a/src/encoded/__init__.py +++ b/src/encoded/__init__.py @@ -313,6 +313,7
@@ def main(global_config, **local_config): if 'elasticsearch.server' in config.registry.settings: config.include('snovault.elasticsearch') config.include('snovault.search.search') + config.include('encoded.browse') config.include('snovault.search.compound_search') # this contains fall back url, so make sure it comes just before static_resoruces diff --git a/src/encoded/browse.py b/src/encoded/browse.py new file mode 100644 index 000000000..98eb0f37e --- /dev/null +++ b/src/encoded/browse.py @@ -0,0 +1,66 @@ +from pyramid.httpexceptions import HTTPFound +from pyramid.security import Authenticated +from pyramid.view import view_config +import structlog +from webob.multidict import MultiDict +from urllib.parse import urlencode +from snovault.search.search import search +from snovault.util import debug_log +from encoded.endpoints.recent_files_summary.recent_files_summary import recent_files_summary_endpoint + +log = structlog.getLogger(__name__) + +# 2024-11-19/dmichaels: Adapted from fourfront for C4-1184. + +def includeme(config): + config.add_route('browse', '/browse{slash:/?}') + config.add_route("recent_files_summary", "/recent_files_summary") + config.scan(__name__) + + +# DEFAULT_BROWSE_TYPE = "FileSet" +# DEFAULT_BROWSE_TYPE = "UnalignedReads" +# DEFAULT_BROWSE_TYPE = "OutputFile" + +DEFAULT_BROWSE_TYPE = "File" +DEFAULT_BROWSE_FACETS = ["file_size"] + +DEFAULT_BROWSE_PARAM_LISTS = { + "type": [DEFAULT_BROWSE_TYPE], + "additional_facet": DEFAULT_BROWSE_FACETS +} + +@view_config(route_name='browse', request_method='GET', permission='search') +@debug_log +def browse(context, request, search_type=DEFAULT_BROWSE_TYPE, return_generator=False): + """ + Simply use search results for browse view + Redirect to proper URL w. params if needed + """ + orig_params = request.params + for k,vals in DEFAULT_BROWSE_PARAM_LISTS.items(): + if k not in orig_params or orig_params[k] not in vals: + # Redirect to DEFAULT_BROWSE_PARAM_LISTS URL + next_qs = MultiDict() + for k2, v2list in DEFAULT_BROWSE_PARAM_LISTS.items(): + for v2 in v2list: + next_qs.add(k2, v2) + # Preserve other keys that arent in DEFAULT_BROWSE_PARAM_LISTS + for k2, v2 in orig_params.items(): + if k2 not in DEFAULT_BROWSE_PARAM_LISTS: + next_qs.add(k2, v2) + # next_qs.add("redirected_from", str(request.path_qs)) + return HTTPFound( + location=str(request.path) + '?' + urlencode(next_qs), + detail="Redirected from " + str(request.path_info) + ) + + # TODO + # No real /browse specific UI yet; initially just basically copied static/components/SearchView.js to BrowseView.js. 
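+    # Example (hypothetical request values) of the redirect behavior implemented above:
+    # a GET of /browse?limit=10 lacks the default type=File parameter, so it is
+    # redirected (HTTPFound) to something like
+    #     /browse?type=File&additional_facet=file_size&limit=10
+    # i.e. the DEFAULT_BROWSE_PARAM_LISTS values are filled in and any other
+    # caller-supplied parameters (here limit=10) are preserved.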
+ return search(context, request, search_type, return_generator, forced_type="Browse") + + +@view_config(route_name="recent_files_summary", request_method=["GET"], effective_principals=Authenticated) +@debug_log +def recent_files_summary(context, request): + return recent_files_summary_endpoint(context, request) diff --git a/src/encoded/endpoints/elasticsearch_utils.py b/src/encoded/endpoints/elasticsearch_utils.py new file mode 100644 index 000000000..daf4ca2e5 --- /dev/null +++ b/src/encoded/endpoints/elasticsearch_utils.py @@ -0,0 +1,546 @@ +from copy import deepcopy +from typing import Any, Callable, List, Optional, Tuple, Union + +AGGREGATION_MAX_BUCKETS = 100 +AGGREGATION_NO_VALUE = "No value" + + +def create_elasticsearch_aggregation_query(fields: List[str], + property_name: Optional[str] = None, + max_buckets: Optional[int] = None, + missing_value: Optional[str] = None, + include_missing: bool = False, + create_field_aggregation: Optional[Callable] = None, + create_field_filter: Optional[Callable] = None, + _toplevel: bool = True) -> dict: + + """ + Returns a dictionary representing an ElasticSearch aggregation query for the field names. + If more than one is given the the aggregation will be nested, one within another, for example, + given ["date_created", "donors.display_title", "release_tracker_description"] we would return + something like this: + + { + "aggregate_by_donor": { + "meta": { "field_name": "date_created" }, + "filter": { + "bool": { + "must": [ + {"exists": {"field": "embedded.date_created.raw"}}, + {"exists": {"field": "embedded.donors.display_title.raw"}}, + {"exists": {"field": "embedded.release_tracker_description.raw"}} + ] + } + }, + "aggs": { + "dummy_date_histogram": { + "date_histogram": { + "field": "embedded.date_created", + "calendar_interval": "month", + "format": "yyyy-MM", "missing": "1970-01", + "order": { "_key": "desc"} + }, + "aggs": { + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "terms": { + "field": "embedded.donors.display_title.raw", + "missing": "No value", "size": 100 + }, + "aggs": { + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "terms": { + "field": "embedded.release_tracker_description.raw", + "missing": "No value", "size": 100 + } + } + } + } + } + } + } + } + } + + The above example assumes that a create_field_aggregation function callable was passed as an argument + and that if/when its argument is date_created then it would have returned something like this: + + { + "date_histogram": { + "field": f"embedded.date_created", + "calendar_interval": "month", + "format": "yyyy-MM", + "missing": "1970-01", + "order": {"_key": "desc"} + } + } + + It further assumes, that the include_missing argument is False (default), in which case items not part of + any of the specified aggregation fields would be filtered out. This demonstrates a slight complication with + this particular case where an extra level of aggregation needs to be introducts (dummy_date_histogram). + This extra bit of cruft, necessary to get the ElasticSearch query to work as expected, manifests itself in + the query result as well and is dispensed with using the prune_elasticsearch_aggregation_results function below. 
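+    A minimal sketch of a call which could produce an aggregation query like the above; the field
+    names and the create_field_aggregation callback shown here are purely illustrative:
+
+        def create_field_aggregation(field):
+            if field == "date_created":
+                return {
+                    "date_histogram": {
+                        "field": f"embedded.{field}",
+                        "calendar_interval": "month",
+                        "format": "yyyy-MM",
+                        "missing": "1970-01",
+                        "order": {"_key": "desc"}
+                    }
+                }
+
+        query = create_elasticsearch_aggregation_query(
+            ["date_created", "donors.display_title", "release_tracker_description"],
+            property_name="aggregate_by_donor",
+            create_field_aggregation=create_field_aggregation)
+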
+ """ + global AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE + + if isinstance(fields, str): + fields = [fields] + if not (isinstance(fields, list) and fields and isinstance(field := fields[0], str) and (field := field.strip())): + return {} + if not isinstance(missing_value, str): + missing_value = AGGREGATION_NO_VALUE + if not (isinstance(max_buckets, int) and (max_buckets > 0)): + max_buckets = AGGREGATION_MAX_BUCKETS + + if not (callable(create_field_aggregation) and + isinstance(field_aggregation := create_field_aggregation(field), dict)): + field_aggregation = { + "terms": { + "field": f"embedded.{field}.raw", + "missing": missing_value, + "size": max_buckets + } + } + + if not (isinstance(property_name, str) and (property_name := property_name.strip())): + property_name = field + + aggregation = {property_name: {"meta": {"field_name": field}}} + + if (include_missing is not True) and (_toplevel is True): + # Filtering out items which are not in any of the aggregations; this introduces complication if + # using date_histogram rather than simple terms, which we need add another level of aggregation + # just for the date_histogram; then the caller will need deal with (remove) it later. + extra_nesting_for_date_histogram_and_filter = "date_histogram" in field_aggregation + for field in fields: + if isinstance(field, str) and (field := field.strip()): + if not (callable(create_field_filter) and isinstance(filter := create_field_filter(field), dict)): + filter = { + "exists": { + "field": f"embedded.{field}.raw" + } + } + if not aggregation[property_name].get("filter"): + aggregation[property_name]["filter"] = {"bool": {"must": []}} + aggregation[property_name]["filter"]["bool"]["must"].append(filter) + else: + extra_nesting_for_date_histogram_and_filter = False + + if not extra_nesting_for_date_histogram_and_filter: + aggregation[property_name].update(field_aggregation) + + if nested_aggregation := create_elasticsearch_aggregation_query( + fields[1:], max_buckets=max_buckets, + missing_value=missing_value, + create_field_aggregation=create_field_aggregation, _toplevel=False): + if extra_nesting_for_date_histogram_and_filter: + aggregation[property_name]["aggs"] = \ + {"dummy_date_histogram": {**field_aggregation, "aggs": nested_aggregation}} + else: + aggregation[property_name]["aggs"] = nested_aggregation + return aggregation + + +def add_debugging_to_elasticsearch_aggregation_query(aggregation_query: dict) -> None: # noqa + top_hits_debug = {"aggs": {"top_hits_debug": {"top_hits": {"_source": False, + "docvalue_fields": ["_id"], "size": 100 }}}} + def add_debug_query(aggs: dict) -> None: # noqa + if "aggs" in aggs: + for _, agg in aggs["aggs"].items(): + add_debug_query(agg) + else: + aggs.update(top_hits_debug) + if isinstance(aggregation_query, dict) and isinstance(aggs := aggregation_query.get("aggs"), dict): + for agg in aggs.values(): + add_debug_query(agg) + + +def prune_elasticsearch_aggregation_results(results: dict) -> None: + """ + This removes any extra level(s) of aggregation (i.e. dummy_date_histogram) that may have been + introduced in the create_elasticsearch_aggregation_query function (above), for when/if both + a filter and a date_histogram are used together. 
+ """ + if isinstance(results, dict): + for key in list(results.keys()): + if (key == "dummy_date_histogram") and isinstance(buckets := results[key].get("buckets"), list): + results["buckets"] = buckets + del results[key] + else: + prune_elasticsearch_aggregation_results(results[key]) + elif isinstance(results, list): + for element in results: + prune_elasticsearch_aggregation_results(element) + + +def merge_elasticsearch_aggregation_results(target: dict, source: dict, copy: bool = False) -> Optional[dict]: + """ + Merges the given second (source) argument into the given first (target) argument (in palce), recursively, both + of which are assumed to be ElasticSearch aggregation query results; doc_coiunt values are updated as expected. + If the given copy argument is True then then the merge is not done to the given target in-place, rather a copy + of it is made and the merge done to it. In eiter case the resultant merged target is returned. For example: + + target = { + "meta": {"field_name": "date_created"}, "doc_count": 15, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 13, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} + ] + } + } + ] + } + } + ] + } + + source = { + "meta": {"field_name": "date_created"}, "doc_count": 16, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 14, + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} + ] + } + } + ] + } + } + ] + } + + merge_elasticsearch_aggregation_results(target, source) == { + "meta": {"field_name": "date_created"}, "doc_count": 15, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 25, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} + ] + } + } + ] + }, + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} + ] + } + } + ] + } + } + ] + } + """ + + def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]: + if isinstance(aggregation, dict) and isinstance(aggregation.get("buckets"), list): + if isinstance(field_name := aggregation.get("meta", {}).get("field_name"), str) and field_name: + if isinstance(aggregation_key, str) and aggregation_key: + if field_name != aggregation_key: + return None + return field_name + return None + + def get_nested_aggregation(aggregation: dict) -> Optional[dict]: + if isinstance(aggregation, dict): + 
for key in aggregation: + if get_aggregation_key(aggregation[key], key): + return aggregation[key] + return None + + def get_aggregation_bucket_value(aggregation_bucket: dict) -> Optional[Any]: + if isinstance(aggregation_bucket, dict): + return aggregation_bucket.get("key_as_string", aggregation_bucket.get("key")) + return None + + def get_aggregation_bucket_doc_count(aggregation_bucket: dict) -> Optional[int]: + if isinstance(aggregation_bucket, dict): + if isinstance(doc_count := aggregation_bucket.get("doc_count"), int): + return doc_count + return None + + def get_aggregation_total_buckets_doc_count(aggregation: dict) -> int: + buckets_doc_count = 0 + if get_aggregation_key(aggregation): + for aggregation_bucket in aggregation["buckets"]: + if (doc_count := get_aggregation_bucket_doc_count(aggregation_bucket)) is not None: + buckets_doc_count += doc_count + return buckets_doc_count + + def find_aggregation_bucket(aggregation: dict, value: str) -> Optional[dict]: + if get_aggregation_key(aggregation): + for aggregation_bucket in aggregation["buckets"]: + if get_aggregation_bucket_value(aggregation_bucket) == value: + return aggregation_bucket + return None + + def merge_results(target: dict, source: dict) -> Tuple[Optional[dict], Optional[int]]: + merged_item_count = 0 + if not ((aggregation_key := get_aggregation_key(source)) and (get_aggregation_key(target) == aggregation_key)): + return 0, None + for source_bucket in source["buckets"]: + if (((source_bucket_value := get_aggregation_bucket_value(source_bucket)) is None) or + ((source_bucket_item_count := get_aggregation_bucket_doc_count(source_bucket)) is None)): # noqa + continue + if (target_bucket := find_aggregation_bucket(target, source_bucket_value)): + if source_nested_aggregation := get_nested_aggregation(source_bucket): + if target_nested_aggregation := get_nested_aggregation(target_bucket): + merged_item_count, merged_results = merge_results(target_nested_aggregation, source_nested_aggregation) + if merged_results is None: + if source_nested_aggregation_key := get_aggregation_key(source_nested_aggregation): + target_bucket[source_nested_aggregation_key] = ( + source_nested_bucket := source_bucket[source_nested_aggregation_key]) + if (source_nested_bucket_item_count := + get_aggregation_total_buckets_doc_count(source_nested_bucket)) > 0: # noqa + target_bucket["doc_count"] += source_nested_bucket_item_count + merged_item_count += source_nested_bucket_item_count + elif merged_item_count > 0: + target_bucket["doc_count"] += merged_item_count + elif get_aggregation_bucket_doc_count(target_bucket) is not None: + target_bucket["doc_count"] += source_bucket_item_count + merged_item_count += source_bucket_item_count + else: + target["buckets"].append(source_bucket) + if isinstance(target.get("doc_count"), int): + target["doc_count"] += source_bucket_item_count + else: + target["doc_count"] = source_bucket_item_count + merged_item_count += source_bucket_item_count + return merged_item_count, target + + if copy is True: + target = deepcopy(target) + + merged_item_count, target = merge_results(target, source) + if (merged_item_count > 0) and (get_aggregation_bucket_doc_count(target) is not None): + target["doc_count"] += merged_item_count + + return target + + +def normalize_elasticsearch_aggregation_results(aggregation: dict, additional_properties: Optional[dict] = None, + remove_empty_items: bool = True, + retain_original_item_count: bool = False) -> dict: + + """ + Normalizes the given result of an ElasticSearch aggregation query 
into a more readable/consumable format. + For example, given the result of the the example for merge_elasticsearch_aggregation_results above as input, + this function would return something like this: + + normalize_elasticsearch_aggregation_results(aggregation_results) == { + "count": 25, + "items": [ + { + "name": "date_created", + "value": "2024-12", "count": 11, + "items": [ + { + "name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code", + "value": "COLO829T", "count": 1, + "items": [ + { + "name": "release_tracker_description", + "value": "WGS ONT PromethION 24 bam", "count": 1 + } + ] + }, + { + "name": "donors.display_title", + "value": "DAC_DONOR_COLO829", "count": 4, + "items": [ + { + "name": "release_tracker_description", + "value": "Fiber-seq PacBio Revio bam", "count": 4 + } + ] + } + ] + } + ] + } + """ + + def get_aggregation_key(aggregation: dict, aggregation_key: Optional[str] = None) -> Optional[str]: + # TODO: same as in merge_elasticsearch_aggregation_results function + if isinstance(aggregation, dict) and isinstance(aggregation.get("buckets"), list): + if isinstance(field_name := aggregation.get("meta", {}).get("field_name"), str) and field_name: + if isinstance(aggregation_key, str) and aggregation_key: + if field_name != aggregation_key: + return None + return field_name + return None + + def get_aggregation_bucket_value(aggregation_bucket: dict) -> Optional[Any]: + # TODO: same as in merge_elasticsearch_aggregation_results function + if isinstance(aggregation_bucket, dict): + return aggregation_bucket.get("key_as_string", aggregation_bucket.get("key")) + return None + + def get_aggregation_bucket_doc_count(aggregation_bucket: dict) -> Optional[int]: + # TODO: same as in merge_elasticsearch_aggregation_results function + if isinstance(aggregation_bucket, dict): + if isinstance(doc_count := aggregation_bucket.get("doc_count"), int): + return doc_count + return None + + def get_aggregation_bucket_debug_hits(aggregation_bucket: dict) -> List[str]: + debug_hits = [] + if isinstance(aggregation_bucket, dict): + if isinstance(aggregation_bucket.get("doc_count"), int): + if (isinstance(top_hits_debug := aggregation_bucket.get("top_hits_debug"), dict) and + isinstance(hits := top_hits_debug.get("hits"), dict) and + isinstance(hits := hits.get("hits"), list)): # noqa + for hit in hits: + if isinstance(hit, dict) and isinstance(hit := hit.get("_id"), str): + debug_hits.append(hit) + return debug_hits + + def get_nested_aggregations(data: dict) -> List[dict]: + results = [] + if isinstance(data, dict): + for key in data: + if get_aggregation_key(data[key]) and data[key]["buckets"]: + results.append(data[key]) + if not results: + if ((isinstance(data.get("buckets"), list) and data["buckets"]) or + (isinstance(data.get("key"), str) and isinstance(data.get("doc_count"), int))): # noqa + results.append(data) + return results + + def find_group_item(group_items: List[dict], value: Any) -> Optional[dict]: + if isinstance(group_items, list): + for group_item in group_items: + if isinstance(group_item, dict) and (value == group_item.get("value")): + return group_item + return None + + def normalize_results(aggregation: dict, + key: Optional[str] = None, value: Optional[str] = None, + additional_properties: Optional[dict] = None) -> dict: + + nonlocal remove_empty_items, retain_original_item_count + + if not (aggregation_key := get_aggregation_key(aggregation)): + return {} + + group_items = [] ; item_count = 0 # noqa + for bucket in aggregation["buckets"]: + if 
(((bucket_value := get_aggregation_bucket_value(bucket)) is None) or + ((bucket_item_count := get_aggregation_bucket_doc_count(bucket)) is None)): # noqa + continue + item_count += bucket_item_count + debug_hits = get_aggregation_bucket_debug_hits(bucket) + if nested_aggregations := get_nested_aggregations(bucket): + for nested_aggregation in nested_aggregations: + if normalized_aggregation := normalize_results(nested_aggregation, aggregation_key, bucket_value): + if normalized_aggregation["count"] != bucket_item_count: + if retain_original_item_count is True: + # The original doc_count value from the raw result may be different/lesser than/from + # the result we aggregate here because ElasticSearch aggregations actually are based + # on unique values. Should we use this as the real count value though it may look wrong. + normalized_aggregation["count"] = bucket_item_count + if group_item := find_group_item(group_items, bucket_value): + for normalized_aggregation_item in normalized_aggregation["items"]: + group_item["items"].append(normalized_aggregation_item) + group_item["count"] += normalized_aggregation_item["count"] + else: + group_item = normalized_aggregation + group_items.append(group_item) + else: + if (remove_empty_items is False) or (bucket_item_count > 0): + group_item = {"name": aggregation_key, "value": bucket_value, "count": bucket_item_count} + if debug_hits: + group_item["debug_elasticsearch_hits"] = debug_hits + group_items.append(group_item) + + if (remove_empty_items is not False) and (not group_items): + return {} + results = {"name": key, "value": value, "count": item_count, "items": group_items} + + if isinstance(additional_properties, dict) and additional_properties: + results = {**additional_properties, **results} + + if key is None: + del results["name"] + if value is None: + del results["value"] + + return results + + results = normalize_results(aggregation, additional_properties=additional_properties) + return results + + +def sort_normalized_aggregation_results(data: dict, sort: Union[bool, str, Callable, + List[Union[bool, str, Callable]]] = False) -> None: + + """ + Sorts the given *normalized* (see above) ElasticSearch aggregation results. + By default, this is by item (doc) count descending and secondarily by key value. 
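+    For example, the call used by the /recent_files_summary endpoint looks like this, where
+    normalized_results is the output of normalize_elasticsearch_aggregation_results above:
+
+        sort_normalized_aggregation_results(normalized_results, ["-key", "default"])
+
+    which sorts the outermost (date) level by its key value descending, and every deeper
+    level by the default ordering (count descending, then key value).
+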
+ """ + + def sort_items(items: List[dict], sort: Union[bool, str, Callable]) -> None: + sort_function_default = lambda item: (-item.get("count", 0), item.get("value", "")) # noqa + if (sort is True) or (isinstance(sort, str) and (sort.strip().lower() == "default")): + items.sort(key=sort_function_default) + elif isinstance(sort, str) and (sort := sort.strip().lower()): + if sort.startswith("-"): + sort_reverse = True + sort = sort[1:] + else: + sort_reverse = False + if sort == "default": + items.sort(key=sort_function_default, reverse=sort_reverse) + elif (sort in ["key", "value"]): + items.sort(key=lambda item: item.get("value", ""), reverse=sort_reverse) + elif callable(sort): + items.sort(key=lambda item: sort(item)) + + def sort_results(data: dict, level: int = 0) -> None: + nonlocal sort + if isinstance(sort, list) and sort: + if level < len(sort): + sort_level = sort[level] + else: + sort_level = sort[len(sort) - 1] + else: + sort_level = sort + if isinstance(data, dict) and isinstance(items := data.get("items"), list): + sort_items(items, sort=sort_level) + for item in items: + sort_results(item, level=level + 1) + + sort_results(data) diff --git a/src/encoded/endpoints/endpoint_utils.py b/src/encoded/endpoints/endpoint_utils.py new file mode 100644 index 000000000..b518e3ea3 --- /dev/null +++ b/src/encoded/endpoints/endpoint_utils.py @@ -0,0 +1,245 @@ +import calendar +from datetime import date, datetime +from dateutil.relativedelta import relativedelta +from pyramid.request import Request as PyramidRequest +from typing import Any, List, Optional, Tuple, Union +from urllib.parse import parse_qs, urlencode +from dcicutils.datetime_utils import parse_datetime_string as dcicutils_parse_datetime_string + + +def request_arg(request: PyramidRequest, name: str, fallback: Optional[str] = None) -> Optional[str]: + return str(value).strip() if (value := request.params.get(name, None)) is not None else fallback + + +def request_arg_int(request: PyramidRequest, name: str, fallback: Optional[int] = 0) -> Optional[Any]: + if (value := request_arg(request, name)) is not None: + try: + return int(value) + except Exception: + pass + return fallback + + +def request_arg_bool(request: PyramidRequest, name: str, fallback: Optional[bool] = False) -> Optional[bool]: + return fallback if (value := request_arg(request, name)) is None else (value.lower() == "true") + + +def request_args(request: PyramidRequest, + name: str, fallback: Optional[str] = None, duplicates: bool = False) -> List[str]: + args = [] + if isinstance(value := request.params.getall(name), list): + # Note that request.paramss.getall always returns a list, + # even if the named query parameter is not specified at all. + if value == []: + if request.params.get(name) is None: + # Only return the fallback if the named query parameter was not specified at all. + return fallback + for item in value: + if isinstance(item, str) and (item := item.strip()): + if (item not in args) or (duplicates is True): + args.append(item) + return args + + +def parse_date_range_related_arguments( + from_date: Optional[Union[str, datetime, date]], + thru_date: Optional[Union[str, datetime, date]], + nmonths: Optional[Union[str, int]] = None, + include_current_month: Optional[bool] = True, + strings: bool = False) -> Tuple[Optional[Union[str, datetime]], Optional[Union[str, datetime]]]: + + """ + Returns from/thru dates based on the given from/thru date arguments and optional nmonths argument. + Given dates may be datetime or date objects or strings. 
Returned dates are datetime objects, or + if the given strings argument is True, then strings (formatted as YYYY-MM-DD). + + If BOTH of the given from/thru dates are specified/valid then those are parsed and returned; + and the given nmonths and include_current_month arguments are NOT used in this case. + + Note that the include_current_month argument is used ONLY if NEITHER from NOR thru date + are specified; and note that its default value is True. + + If only the given from date is specified then a None thru date is returned, UNLESS the given nmonths + argument represents a positive integer, in which case the returned thru date will be nmonths months + subsequent to the given from date; or if the given nmonths represents zero, in which case the + returned thru date will be the last date of the month of the given from date. + + If only the given thru date is specified then a None from date is returned, UNLESS the given nmonths + argument represents a negative integer, in which case the returned from date will be nmonths months + previous to the given thru date; or if the given nmonths represents zero, in which case + the returned from date will be the first date of the month of the given thru date. + + If neither of the given from/thru dates is specified then None is returned for both, UNLESS the given + nmonths argument represents a non-zero integer, in which case the returned from/thru dates will represent + the past (absolute value) nmonths months starting with the month previous to the month of "today"; however + if include_current_month is True it is rather the past nmonths months starting with the month of "today". + + FYI WRT smaht-portal/elasticsearch behavior and dates, when using a query like date_created.from=2024-11-01 + and date_created.to=2024-10-31, what is actually passed to the elasticsearch filter/range query looks like: + + "range": { + "date_created": { + "gte": "2024-10-31 00:00", + "lte": "2024-12-31 23:59" + } + } + + I.e. the "from" date is from the very BEGINNING of the date/day (00:00) and greater-than-or-EQUAL + to, and the "thru" date is thru the very END of the date/day (23:59). This is actually done by the method + snovault.search.lucene_builder.LuceneBuilder.handle_range_filters. + """ + from_date = parse_datetime_string(from_date, notz=True) + thru_date = parse_datetime_string(thru_date, last_day_of_month_if_no_day=True, notz=True) + if nmonths is None: + nmonths = 0 + nmonths_none = True + else: + nmonths_none = False + if not isinstance(nmonths, int): + if isinstance(nmonths, str) and (nmonths := nmonths.strip()): + try: + nmonths = int(nmonths) + except Exception: + nmonths = 0 + else: + nmonths = 0 + if from_date: + if (not thru_date) and isinstance(nmonths, int): + if nmonths > 0: + thru_date = _add_months(from_date, nmonths) + elif (nmonths == 0) and (not nmonths_none): + thru_date = _get_last_date_of_month(from_date) + elif thru_date: + if isinstance(nmonths, int): + if nmonths < 0: + from_date = _add_months(thru_date, nmonths) + elif (nmonths == 0) and (not nmonths_none): + from_date = _get_first_date_of_month(thru_date) + elif ((nmonths := abs(nmonths)) != 0) or (include_current_month is not False): + # If no (valid) from/thru dates given, but the absolute value of nmonths is a non-zero integer, then returns + # from/thru dates for the last nmonths months ending with the last day of the month previous to the current month.
+ # thru_date = _add_months(_get_last_date_of_month(), -1) + thru_date = _get_last_date_of_month() + if include_current_month is False: + thru_date = _get_last_date_of_month(_add_months(thru_date, -1)) + nmonths -= 1 + from_date = _add_months(thru_date, -nmonths) + from_date = _get_first_date_of_month(from_date) + if strings is True: + from_date = from_date.strftime(f"%Y-%m-%d") if from_date else None + thru_date = thru_date.strftime(f"%Y-%m-%d") if thru_date else None + if from_date and thru_date and thru_date < from_date: + from_date, thru_date = thru_date, from_date + return from_date, thru_date + + +def parse_datetime_string(value: Union[str, datetime, date], + last_day_of_month_if_no_day: bool = False, + notz: bool = False) -> Optional[datetime]: + """ + Wrapper around dcicutils.datetime_utils.parse_datetime_string to handle a few special cases for convenience. + """ + last_day_of_month = False + if not isinstance(value, datetime): + if isinstance(value, date): + value = datetime.combine(value, datetime.min.time()) + elif isinstance(value, str): + if (len(value) == 8) and value.isdigit(): + # Special case to accept for example "20241206" to mean "2024-12-06". + value = f"{value[0:4]}-{value[4:6]}-{value[6:8]}" + elif (len(value) == 7) and (value[4] == "-") and value[0:4].isdigit() and value[5:].isdigit(): + # Special case to accept for example "2024-10" to mean "2024-10-01". + value = f"{value}-01" + last_day_of_month = last_day_of_month_if_no_day + elif (len(value) == 6) and value[0:4].isdigit() and value[4:].isdigit(): + # Special case to accept for example "202410" to mean "2024-10-01". + value = f"{value[0:4]}-{value[4:]}-01" + last_day_of_month = last_day_of_month_if_no_day + elif (len(value) == 7) and (value[2] == "/") and value[0:2].isdigit() and value[3:].isdigit(): + # Special case to accept for example "11/2024" to mean "2024-11-01". + value = f"{value[3:]}-{value[0:2]}-01" + last_day_of_month = last_day_of_month_if_no_day + elif (len(value) == 6) and (value[1] == "/") and value[0:1].isdigit() and value[2:].isdigit(): + # Special case to accept for example "9/2024" to mean "2024-09-01". + value = f"{value[2:]}-0{value[0:1]}-01" + last_day_of_month = last_day_of_month_if_no_day + if not (value := dcicutils_parse_datetime_string(value)): + return None + else: + return None + value = value.replace(tzinfo=None) if notz is True else value + if last_day_of_month: + value = _get_last_date_of_month(value) + return value + + +def get_date_range_for_month( + date: Union[str, datetime, date], + strings: bool = False) -> Tuple[Optional[Union[str, datetime]], Optional[Union[str, datetime]]]: + if date := parse_datetime_string(date, notz=True): + from_date = _get_first_date_of_month(date) + thru_date = _get_last_date_of_month(date) + if strings is True: + from_date = from_date.strftime(f"%Y-%m-%d") if from_date else None + thru_date = thru_date.strftime(f"%Y-%m-%d") if thru_date else None + else: + from_date = thru_date = None + return from_date, thru_date + + +def _get_first_date_of_month(day: Optional[Union[datetime, date, str]] = None) -> datetime: + """ + Returns a datetime object representing the first day of the month of the given date; + this given date may be a datetime or date object, or string representing a date or + datetime; if the given argument is unspecified or incorrect then assumes "today". 
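+    For example (a small illustrative sketch, assuming the string parses as expected):
+
+        _get_first_date_of_month("2024-10-15")  # -> datetime(2024, 10, 1)
+        _get_first_date_of_month()              # -> first day of the current month
+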
+ """ + if not (day := parse_datetime_string(day, notz=True)): + day = datetime.today().replace(tzinfo=None) + return day.replace(day=1) + + +def _get_last_date_of_month(day: Optional[Union[datetime, date, str]] = None) -> datetime: + """ + Returns a datetime object representing the last day of the month of the given date; + this given date may be a datetime or date object, or string representing a date or + datetime; if the given argument is unspecified or incorrect then assumes "today". + """ + if not (day := parse_datetime_string(day)): + day = datetime.today().replace(tzinfo=None) + return datetime(day.year, day.month, calendar.monthrange(day.year, day.month)[1]) + + +def _add_months(day: Optional[Union[datetime, date, str]] = None, nmonths: int = 0) -> datetime: + """ + Returns a datetime object representing the given date with the given nmonths number of months + added (or substracted if negative) to (or from) that given date.; this given date may be a + datetime or date object, or string representing a date or datetime; if the given argument + is unspecified or incorrect then assumes "today". + """ + if not (day := parse_datetime_string(day, notz=True)): + day = datetime.today().replace(tzinfo=None) + if isinstance(nmonths, int) and (nmonths != 0): + return day + relativedelta(months=nmonths) + return day + + +def create_query_string(query_arguments: dict, base: Optional[str] = None) -> str: + query_string = "" + if isinstance(query_arguments, dict): + if query_arguments := {key: value for key, value in query_arguments.items() if value is not None}: + query_string = urlencode(query_arguments, True) + # Hackishness to change "=!" to "!=" in query_string value for e.g. to turn this + # {"data_category": ["!Quality Control"]} into this: data_category&21=Quality+Control + query_string = query_string.replace("=%21", "%21=") + if isinstance(base, str) and base: + query_string = f"{base}?{query_string}" if query_string else base + return query_string + + +def deconstruct_query_string(query_string: str) -> dict: + if isinstance(query_string, str): + if (question_mark_index := query_string.find("?")) >= 0: + query_string = query_string[question_mark_index + 1:] + query_string = query_string.replace("%21=", "=%21") + return {key: value[0] if len(value) == 1 else value for key, value in parse_qs(query_string).items()} + return {} diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py new file mode 100644 index 000000000..4c72af2ab --- /dev/null +++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary.py @@ -0,0 +1,500 @@ +from copy import deepcopy +from pyramid.request import Request as PyramidRequest, Response as PyramidResponse +from typing import List, Optional +from dcicutils.misc_utils import normalize_spaces +from encoded.endpoints.elasticsearch_utils import ( + add_debugging_to_elasticsearch_aggregation_query, + create_elasticsearch_aggregation_query, + merge_elasticsearch_aggregation_results, + normalize_elasticsearch_aggregation_results, + prune_elasticsearch_aggregation_results, + sort_normalized_aggregation_results, + AGGREGATION_MAX_BUCKETS, AGGREGATION_NO_VALUE) +from encoded.endpoints.endpoint_utils import ( + request_arg, request_args, request_arg_bool, request_arg_int, + create_query_string, deconstruct_query_string, + get_date_range_for_month, parse_date_range_related_arguments) +from encoded.endpoints.recent_files_summary.recent_files_summary_fields import ( + 
AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_CELL_MIXTURE, + AGGREGATION_FIELD_DONOR, + AGGREGATION_FIELD_FILE_DESCRIPTOR) +from encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting import ( + add_info_for_troubleshooting, + get_normalized_aggregation_results_as_html_for_troublehshooting) +from snovault.search.search import search as snovault_search +from snovault.search.search_utils import make_search_subreq as snovault_make_search_subreq + +QUERY_FILE_TYPES = ["OutputFile"] +QUERY_FILE_STATUSES = ["released"] +QUERY_FILE_CATEGORIES = ["!Quality Control"] +QUERY_RECENT_MONTHS = 3 +QUERY_INCLUDE_CURRENT_MONTH = True +BASE_SEARCH_QUERY = "/search/" + + +def recent_files_summary_endpoint(context, request): + # This text=true support is purely for troublesooting purposes; it dumps + # terminal-like formatted output for the results returned by the query. + text = request_arg_bool(request, "text") + results = recent_files_summary(request, troubleshooting=text) + if text: + text_uuids = request_arg_bool(request, "text_uuids", True) + text_uuid_details = request_arg_bool(request, "text_uuid_details", True) + text_query = request_arg_bool(request, "text_query") + text_verbose = request_arg_bool(request, "text_verbose") + text_debug = request_arg_bool(request, "text_debug") + results = get_normalized_aggregation_results_as_html_for_troublehshooting(results, + uuids=text_uuids, + uuid_details=text_uuid_details, + query=text_query, + verbose=text_verbose, + debug=text_debug) + results = PyramidResponse(f"
{results}
", content_type='text/html') + return results + + +def recent_files_summary(request: PyramidRequest, troubleshooting: bool = True) -> dict: + """ + This supports the (new as of 2024-12) /recent_files_summary endpoint (for C4-1192) to return, + by default, info for files released withing the past three months grouped by release-date, + cell-line or donor, and file-description. The specific fields used for these groupings are: + + - release-date: file_status_tracking.released + - cell-line: file_sets.libraries.analytes.samples.sample_sources.cell_line.code + - donor: donors.display_title + - file-dsecription: release_tracker_description + + Note that release_tracker_description is a newer (2024-12) + calculated property - see PR-298 (branch: sn_file_release_tracker). + + By default the current (assuminging partial) month IS included, so we really return info for + the past FULL three months plus for whatever time has currently elapsed for the current month. + Use pass the include_current_month=false query argument to NOT include the current month. + + The number of months of data can be controlled using the nmonths query argument, e.g. nmonths=6. + + A specific date range can also be passed in e.g. using from_date=2024-08-01 and thru_date=2024-10-31. + + For testing purposes, a date field other than the default file_status_tracking.released can + also be specified using the date_property_name query argument. And file statuses other than + released can be queried for using one or more status query arguments, e.g. status=uploaded. + """ + + date_property_name = request_arg(request, "date_property_name", AGGREGATION_FIELD_RELEASE_DATE) + max_buckets = request_arg_bool(request, "max_buckets", AGGREGATION_MAX_BUCKETS) + include_queries = request_arg_bool(request, "include_queries", request_arg_bool(request, "include_query", True)) + include_missing = request_arg_bool(request, "include_missing", request_arg_bool(request, "novalues")) + nocells = request_arg_bool(request, "nocells", request_arg_bool(request, "nocell", True)) # N.B. default True + nomixtures = request_arg_bool(request, "nomixtures", request_arg_bool(request, "nomixture")) + nodonors = request_arg_bool(request, "nodonors", request_arg_bool(request, "nodonor")) + favor_donor = request_arg_bool(request, "favor_donor") + multi = request_arg_bool(request, "multi") + nosort = request_arg_bool(request, "nosort") + legacy = request_arg_bool(request, "legacy") + debug = request_arg_bool(request, "debug") + debug_query = request_arg_bool(request, "debug_query") + troubleshoot = request_arg_bool(request, "troubleshoot") + troubleshoot_elasticsearch = request_arg_bool(request, "troubleshoot_elasticsearch") + raw = request_arg_bool(request, "raw") + willrfix = request_arg_bool(request, "willrfix") + + if troubleshooting is True: + debug = True + troubleshoot = True + troubleshoot_elasticsearch = True + + def get_aggregation_field_grouping_cell_or_donor() -> List[str]: + # This specializes the aggregation query to group first by the cell-line field, + # and then alternatively (if a cell-line field does not exist) by the donor field. + # For troubleshooting/testing/or-maybe-if-we-change-our-minds we can alternatively + # look first for the donor field and then secondarily for the cell-line field. 
+ nonlocal nocells, nomixtures, nodonors, favor_donor + aggregation_field_grouping_cell_or_donor = deepcopy(AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR) + if nocells: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_LINE) + if nomixtures: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_CELL_MIXTURE) + if nodonors: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR) + if favor_donor: + aggregation_field_grouping_cell_or_donor.remove(AGGREGATION_FIELD_DONOR) + aggregation_field_grouping_cell_or_donor.insert(0, AGGREGATION_FIELD_DONOR) + return aggregation_field_grouping_cell_or_donor + + def create_base_query_arguments(request: PyramidRequest) -> dict: + + global QUERY_FILE_CATEGORIES, QUERY_FILE_STATUSES, QUERY_FILE_TYPES + + types = request_args(request, "type", QUERY_FILE_TYPES) + statuses = request_args(request, "status", QUERY_FILE_STATUSES) + categories = request_args(request, "category", QUERY_FILE_CATEGORIES) + + base_query_arguments = { + "type": types if types else None, + "status": statuses if statuses else None, + "data_category": categories if categories else None + } + + return {key: value for key, value in base_query_arguments.items() if value is not None} + + def create_query_arguments(request: PyramidRequest, base_query_arguments: Optional[dict] = None) -> str: + + global BASE_SEARCH_QUERY, QUERY_RECENT_MONTHS, QUERY_INCLUDE_CURRENT_MONTH + nonlocal date_property_name + + recent_months = request_arg_int(request, "nmonths", request_arg_int(request, "months", QUERY_RECENT_MONTHS)) + from_date = request_arg(request, "from_date") + thru_date = request_arg(request, "thru_date") + include_current_month = request_arg_bool(request, "include_current_month", QUERY_INCLUDE_CURRENT_MONTH) + + from_date, thru_date = parse_date_range_related_arguments(from_date, thru_date, nmonths=recent_months, + include_current_month=include_current_month, + strings=True) + query_arguments = { + f"{date_property_name}.from": from_date if from_date else None, + f"{date_property_name}.to": thru_date if from_date else None + } + + if isinstance(base_query_arguments, dict): + query_arguments = {**base_query_arguments, **query_arguments} + return query_arguments + + def create_query(request: PyramidRequest, base_query_arguments: Optional[dict] = None) -> str: + query_arguments = create_query_arguments(request, base_query_arguments) + query_string = create_query_string(query_arguments) + return f"{BASE_SEARCH_QUERY}?{query_string}" + + def create_aggregation_query(aggregation_fields: List[str]) -> dict: + + nonlocal date_property_name, max_buckets, include_missing, favor_donor, troubleshoot_elasticsearch + + aggregations = [] + if not isinstance(aggregation_fields, list): + aggregation_fields = [aggregation_fields] + for item in aggregation_fields: + if isinstance(item, str) and (item := item.strip()) and (item not in aggregations): + aggregations.append(item) + if not aggregations: + return {} + + def create_field_aggregation(field: str) -> Optional[dict]: # noqa + nonlocal aggregation_field_grouping_cell_or_donor, date_property_name, multi + if field == date_property_name: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", + "format": "yyyy-MM", + "missing": "1970-01", + "order": {"_key": "desc"} + } + } + elif field == AGGREGATION_FIELD_CELL_LINE: + # Note how we prefix the result with the aggregation field name; + # this is so later we can tell which grouping/field was matched; + # see 
fixup_names_values_for_normalized_results for this fixup. + script = "" + for aggregation_field_grouping_index in range(len(aggregation_field_grouping_cell_or_donor)): + aggregation_field = aggregation_field_grouping_cell_or_donor[aggregation_field_grouping_index] + if_or_else_if = "if" if aggregation_field_grouping_index == 0 else "else if" + # Note that if there are multiple values for the aggregation field just the "first" one will be chosen; + # where "first" means which was indexed first, which from an application POV is kind of arbitrary. + # If we want to make it more deterministic we could order the results (say) alphabetically like so: + # def value = doc['embedded.{aggregation_field}.raw'].stream().min((a, b) -> a.compareTo(b)).get(); + # return '{aggregation_field}:' + value; + # OR, if we actually want to aggregation on ALL values we could collect the results and return all like so: + # def values = []; + # for (value in doc['embedded.{aggregation_field}.raw']) { + # values.add('{aggregation_field}:' + value); + # } + # return values; + # But then we'd get double counting and so on. We are told in any case that these groups should be distinct. + if not multi: + script += f""" + {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ + return '{aggregation_field}:' + doc['embedded.{aggregation_field}.raw'].value; + }} + """ + else: + script += f""" + {if_or_else_if} (doc['embedded.{aggregation_field}.raw'].size() > 0) {{ + def values = []; + for (value in doc['embedded.{aggregation_field}.raw']) {{ + values.add('{aggregation_field}:' + value); + }} + return values; + }} + """ + script += f""" + else {{ + return 'unknown'; + }} + """ + return { + "terms": { + "script": { + "source": normalize_spaces(script), + "lang": "painless" + }, + "size": max_buckets + } + } + + def create_field_filter(field: str) -> Optional[dict]: # noqa + nonlocal aggregation_field_grouping_cell_or_donor + if field == AGGREGATION_FIELD_CELL_LINE: + filter = {"bool": {"should": [], "minimum_should_match": 1}} + for aggregation_field in aggregation_field_grouping_cell_or_donor: + filter["bool"]["should"].append({"exists": { "field": f"embedded.{aggregation_field}.raw"}}) + return filter + + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, + max_buckets=max_buckets, + missing_value=AGGREGATION_NO_VALUE, + include_missing=include_missing, + create_field_aggregation=create_field_aggregation, + create_field_filter=create_field_filter) + + if troubleshoot_elasticsearch: + add_debugging_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) + + return aggregation_query[date_property_name] + + def create_aggregation_query_legacy(aggregation_fields: List[str]) -> dict: + + nonlocal date_property_name, max_buckets, include_missing + + aggregations = [] + if not isinstance(aggregation_fields, list): + aggregation_fields = [aggregation_fields] + for item in aggregation_fields: + if isinstance(item, str) and (item := item.strip()) and (item not in aggregations): + aggregations.append(item) + if not aggregations: + return {} + + def create_field_aggregation(field: str) -> Optional[dict]: # noqa + nonlocal date_property_name + if field == date_property_name: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", + "format": "yyyy-MM", + "missing": "1970-01", + "order": {"_key": "desc"} + } + } + + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, + max_buckets=max_buckets, + 
missing_value=AGGREGATION_NO_VALUE, + include_missing=include_missing, + create_field_aggregation=create_field_aggregation) + + if troubleshoot_elasticsearch: + add_debugging_to_elasticsearch_aggregation_query(aggregation_query[date_property_name]) + + return aggregation_query[date_property_name] + + def execute_aggregation_query(request: PyramidRequest, query: str, aggregation_query: dict) -> str: + query += "&from=0&limit=0" # needed for aggregation query to not return the actual/individual item results. + request = snovault_make_search_subreq(request, path=query, method="GET") + results = snovault_search(None, request, custom_aggregations=aggregation_query) + return results + + def fixup_names_values_for_normalized_results(normalized_results: dict) -> None: + nonlocal aggregation_field_grouping_cell_or_donor + if isinstance(normalized_results, dict): + if isinstance(value := normalized_results.get("value"), str): + if ((separator_index := value.find(":")) > 0) and (value_prefix := value[0:separator_index]): + if value_prefix in aggregation_field_grouping_cell_or_donor: + if value := value[separator_index + 1:]: + normalized_results["name"] = value_prefix + normalized_results["value"] = value + if isinstance(items := normalized_results.get("items"), list): + for element in items: + fixup_names_values_for_normalized_results(element) + + def add_queries_to_normalized_results(normalized_results: dict, base_query_arguments: dict) -> None: + global BASE_SEARCH_QUERY + nonlocal date_property_name, willrfix + if isinstance(normalized_results, dict): + if name := normalized_results.get("name"): + if value := normalized_results.get("value"): + if name == date_property_name: + # Special case for date value which is just year/month (e.g. 2024-12); + # we want to turn this into a date range query for the month; actually + # this is not a special case, this is the NORMAL case we are dealing with. + # from_date, thru_date = parse_date_range_related_arguments(value, None, nmonths=0, strings=True) + from_date, thru_date = get_date_range_for_month(value, strings=True) + if from_date and thru_date: + base_query_arguments = {**base_query_arguments, + f"{name}.from": from_date, f"{name}.to": thru_date} + else: + base_query_arguments = {**base_query_arguments, name: value} + if willrfix: + if name == AGGREGATION_FIELD_CELL_LINE: + base_query_arguments[AGGREGATION_FIELD_CELL_MIXTURE] = AGGREGATION_NO_VALUE + elif name == AGGREGATION_FIELD_DONOR: + base_query_arguments[AGGREGATION_FIELD_CELL_MIXTURE] = AGGREGATION_NO_VALUE + base_query_arguments[AGGREGATION_FIELD_CELL_LINE] = AGGREGATION_NO_VALUE + normalized_results["query"] = create_query_string(base_query_arguments, BASE_SEARCH_QUERY) + if isinstance(items := normalized_results.get("items"), list): + for element in items: + add_queries_to_normalized_results(element, base_query_arguments) + + aggregation_field_grouping_cell_or_donor = get_aggregation_field_grouping_cell_or_donor() + # The base_query_arguments does not contain the from/thru dates as this is used; + # this is used to construct the query-string for the individually grouped items which + # will have the from/thru dates specifically representing their place within the group. 
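+    # For example (hypothetical dates), with the default query arguments base_query_arguments
+    # would be {"type": ["OutputFile"], "status": ["released"], "data_category": ["!Quality Control"]}
+    # and the full query built from it would look something like:
+    #     /search/?type=OutputFile&status=released&data_category%21=Quality+Control
+    #         &file_status_tracking.released.from=2024-09-01&file_status_tracking.released.to=2024-12-31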
+    base_query_arguments = create_base_query_arguments(request)
+    query = create_query(request, base_query_arguments)
+
+    if not legacy:
+        aggregate_by_cell_line_property_name = "aggregate_by_cell_line"
+        aggregate_by_cell_line = [
+            date_property_name,
+            AGGREGATION_FIELD_CELL_LINE,
+            AGGREGATION_FIELD_FILE_DESCRIPTOR
+        ]
+        aggregation_query = {
+            aggregate_by_cell_line_property_name: create_aggregation_query(aggregate_by_cell_line)
+        }
+    else:
+        aggregate_by_cell_line_property_name = "aggregate_by_cell_line"
+        aggregate_by_cell_line = [
+            date_property_name,
+            AGGREGATION_FIELD_CELL_LINE,
+            AGGREGATION_FIELD_FILE_DESCRIPTOR
+        ]
+        aggregate_by_donor_property_name = "aggregate_by_donor"
+        aggregate_by_donor = [
+            date_property_name,
+            AGGREGATION_FIELD_DONOR,
+            AGGREGATION_FIELD_FILE_DESCRIPTOR
+        ]
+        aggregation_query = {
+            aggregate_by_cell_line_property_name: create_aggregation_query_legacy(aggregate_by_cell_line),
+            aggregate_by_donor_property_name: create_aggregation_query_legacy(aggregate_by_donor)
+        }
+
+    if debug_query:
+        return {
+            "query": query,
+            "query_arguments": deconstruct_query_string(query),
+            "aggregation_query_fields": [
+                AGGREGATION_FIELD_RELEASE_DATE,
+                *get_aggregation_field_grouping_cell_or_donor(),
+                AGGREGATION_FIELD_FILE_DESCRIPTOR
+            ],
+            "aggregation_query": aggregation_query
+        }
+
+    raw_results = execute_aggregation_query(request, query, aggregation_query)
+
+    if raw:
+        # For debugging/troubleshooting only: if raw=true then return the raw ElasticSearch results.
+        # And note that unless we remove the @id property we get redirected to the URL in this field,
+        # for example to: /search/?type=OutputFile&status=released&data_category%21=Quality+Control
+        #                 &file_status_tracking.released.from=2024-09-30
+        #                 &file_status_tracking.released.to=2024-12-31&from=0&limit=0
+        if "@id" in raw_results:
+            del raw_results["@id"]
+        return raw_results
+
+    if not (raw_results := raw_results.get("aggregations")):
+        return {}
+
+    if debug:
+        raw_results = deepcopy(raw_results)  # otherwise may be overwritten by below
+
+    prune_elasticsearch_aggregation_results(raw_results)
+
+    if not legacy:
+        aggregation_results = raw_results.get(aggregate_by_cell_line_property_name)
+    else:
+        aggregation_results = merge_elasticsearch_aggregation_results(raw_results.get(aggregate_by_cell_line_property_name),
+                                                                      raw_results.get(aggregate_by_donor_property_name))
+
+    # Note that the doc_count values returned by ElasticSearch DO actually seem to be for UNIQUE items,
+    # i.e. if an item appears in two different groups (e.g. if, say, f2584000-f810-44b6-8eb7-855298c58eb3
+    # has file_sets.libraries.analytes.samples.sample_sources.cell_line.code values for both HG00438 and HG005),
+    # then its doc_count will NOT be counted TWICE. This creates a situation where it might LOOK like the counts
+    # are WRONG in the MERGED (returned via merge_elasticsearch_aggregation_results) result set, where the outer
+    # item count may be less than the sum of the individual counts within each sub-group. For example, the below
+    # result shows a top-level doc_count of 1, even though there are 2 documents, 1 in the HG00438 group and the
+    # other in the HG005 group; this is because the same unique file has a cell_line.code of both HG00438 and HG005.
+    # {
+    #   "meta": { "field_name": "file_status_tracking.released" },
+    #   "buckets": [
+    #     {
+    #       "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 1,
+    #       "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": {
+    #         "meta": { "field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code" },
+    #         "buckets": [
+    #           { "key": "HG00438", "doc_count": 1,
+    #             "release_tracker_description": {
+    #               "meta": { "field_name": "release_tracker_description" },
+    #               "buckets": [
+    #                 { "key": "WGS Illumina NovaSeq X bam", "doc_count": 1 }
+    #               ]
+    #             }
+    #           },
+    #           { "key": "HG005", "doc_count": 1,
+    #             "release_tracker_description": {
+    #               "meta": { "field_name": "release_tracker_description" },
+    #               "buckets": [
+    #                 { "key": "Fiber-seq PacBio Revio bam", "doc_count": 1 }
+    #               ]
+    #             }
+    #           }
+    #         ]
+    #       }
+    #     }
+    #   ]
+    # }
+
+    if debug:
+        additional_properties = {
+            "debug": {
+                "query": query,
+                "query_arguments": deconstruct_query_string(query),
+                "aggregation_query_fields": [
+                    AGGREGATION_FIELD_RELEASE_DATE,
+                    *get_aggregation_field_grouping_cell_or_donor(),
+                    AGGREGATION_FIELD_FILE_DESCRIPTOR
+                ],
+                "aggregation_query": aggregation_query,
+                "raw_results": raw_results,
+                "aggregation_results": deepcopy(aggregation_results)
+            }
+        }
+    else:
+        additional_properties = None
+
+    normalized_results = normalize_elasticsearch_aggregation_results(aggregation_results,
+                                                                     additional_properties=additional_properties,
+                                                                     remove_empty_items=not include_missing)
+    if not legacy:
+        fixup_names_values_for_normalized_results(normalized_results)
+    if include_queries:
+        add_queries_to_normalized_results(normalized_results, base_query_arguments)
+        normalized_results["query"] = query
+
+    if not nosort:
+        # We can sort on the aggregations by level; outermost/left to innermost/right.
+        # In our case the outermost is the date aggregation so sort that by the key value,
+        # e.g. 2024-12, descending; and the rest of the inner levels by the default
+        # sorting which is by aggregation count descending and secondarily by the key value.
+        sort_normalized_aggregation_results(normalized_results, ["-key", "default"])
+
+    if troubleshoot:
+        add_info_for_troubleshooting(normalized_results, request)
+
+    return normalized_results
diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_fields.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_fields.py
new file mode 100644
index 000000000..c7a9e6a16
--- /dev/null
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_fields.py
@@ -0,0 +1,16 @@
+# These are all the possible fields by which the /recent_files_summary endpoint can aggregate.
+# Various flags modify the specifics, for experimentation, troubleshooting, and possible future changes.
+
+AGGREGATION_FIELD_RELEASE_DATE = "file_status_tracking.released"
+# FYI FWIW: There is also file_sets.libraries.analytes.samples.sample_sources.display_title;
+# and sometimes file_sets.libraries.analytes.samples.sample_sources.code does not exist.
+AGGREGATION_FIELD_CELL_MIXTURE = "file_sets.libraries.analytes.samples.sample_sources.code"
+AGGREGATION_FIELD_CELL_LINE = "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"
+AGGREGATION_FIELD_DONOR = "donors.display_title"
+AGGREGATION_FIELD_FILE_DESCRIPTOR = "release_tracker_description"
+
+AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR = [
+    AGGREGATION_FIELD_CELL_MIXTURE,
+    AGGREGATION_FIELD_CELL_LINE,
+    AGGREGATION_FIELD_DONOR
+]
diff --git a/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
new file mode 100644
index 000000000..7c71391fe
--- /dev/null
+++ b/src/encoded/endpoints/recent_files_summary/recent_files_summary_troubleshooting.py
@@ -0,0 +1,550 @@
+from contextlib import contextmanager
+from copy import deepcopy
+from pyramid.request import Request as PyramidRequest
+from termcolor import colored
+from typing import Any, Callable, List, Optional, Tuple, Union
+from unittest.mock import patch as patch
+from encoded.endpoints.endpoint_utils import parse_datetime_string
+from encoded.endpoints.recent_files_summary.recent_files_summary_fields import (
+    AGGREGATION_FIELD_RELEASE_DATE,
+    AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR,
+    AGGREGATION_FIELD_CELL_LINE,
+    AGGREGATION_FIELD_CELL_MIXTURE,
+    AGGREGATION_FIELD_DONOR,
+    AGGREGATION_FIELD_FILE_DESCRIPTOR)
+
+def add_info_for_troubleshooting(normalized_results: dict, request: PyramidRequest) -> None:
+
+    def get_files(files, property_name, property_value, map_property_value = None):
+        found = []
+        for file in files:
+            if properties := _get_properties(file, property_name):
+                if callable(map_property_value):
+                    mapped_properties = []
+                    for value in properties:
+                        mapped_properties.append(map_property_value(value))
+                    properties = mapped_properties
+                if property_value in properties:
+                    found.append(file)
+        return found
+
+    def map_date_property_value(value):
+        if date_value := parse_datetime_string(value):
+            return f"{date_value.year}-{date_value.month:02}"
+        return value
+
+    def count_uuid(uuid_records: List[dict], uuid: str) -> int:
+        count = 0
+        for uuid_record in uuid_records:
+            if uuid_record.get("uuid") == uuid:
+                count += 1
+        return count
+
+    def dedup_list(data: list) -> list:  # noqa
+        return list(dict.fromkeys(data)) if isinstance(data, list) else []
+
+    aggregation_fields_for_troubleshooting = dedup_list([
+        AGGREGATION_FIELD_RELEASE_DATE,
+        AGGREGATION_FIELD_CELL_MIXTURE,
+        AGGREGATION_FIELD_CELL_LINE,
+        # Store some extra properties for troubleshooting (as this whole thing is).
+ "file_sets.libraries.analytes.samples.sample_sources.display_title", + AGGREGATION_FIELD_DONOR, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ]) + + def annotate_with_uuids(normalized_results: dict): + + def get_unique_release_tracker_description_values(normalized_results: dict) -> List[str]: + return _get_properties(normalized_results, "items.items.items.value") + + nonlocal aggregation_fields_for_troubleshooting + unique_release_tracker_description_values = get_unique_release_tracker_description_values(normalized_results) + uuid_records = [] + query = normalized_results.get("query") + if isinstance(normalized_results.get("debug"), dict): + normalized_results["debug"]["aggregation_fields_for_troubleshooting"] = ( + aggregation_fields_for_troubleshooting) + files = request.embed(f"{query}&limit=1000", as_user="IMPORT")["@graph"] + for first_item in normalized_results["items"]: + first_property_name = first_item["name"] + first_property_value = first_item["value"] + for second_item in first_item["items"]: + second_property_name = second_item["name"] + second_property_value = second_item["value"] + second_item_items = second_item["items"] + # Put dummy elements in for AGGREGATION_FIELD_FILE_DESCRIPTOR items values which do not exist. + third_item_values = [third_item["value"] for third_item in second_item_items] + for unique_release_tracker_description_value in unique_release_tracker_description_values: + if unique_release_tracker_description_value not in third_item_values: + second_item["items"].append({ + "name": AGGREGATION_FIELD_FILE_DESCRIPTOR, + "value": unique_release_tracker_description_value, + "count": 0, + "elasticsearch_counted": False, + "debug_placeholder": True + }) + third_items_to_delete = [] + for third_item in second_item_items: + third_property_name = third_item["name"] + third_property_value = third_item["value"] + if debug_elasticsearch_hits := third_item.get("debug_elasticsearch_hits"): + if not third_item.get("debug"): + third_item["debug"] = {} + third_item["debug"]["elasticsearch_hits"] = debug_elasticsearch_hits + third_item["debug"]["elasticsearch_hits"].sort() + del third_item["debug_elasticsearch_hits"] + if first_files := get_files(files, first_property_name, first_property_value, + map_property_value=map_date_property_value): + if second_files := get_files(first_files, second_property_name, second_property_value): + if third_files := get_files(second_files, third_property_name, third_property_value): + for file in third_files: + if isinstance(uuid := file.get("uuid"), str): + if not third_item.get("debug"): + third_item["debug"] = {} + if not third_item["debug"].get("portal_hits"): + third_item["debug"]["portal_hits"] = [] + uuid_record = {"uuid": uuid} + for aggregation_field in aggregation_fields_for_troubleshooting: + aggregation_values = ", ".join(_get_properties(file, aggregation_field)) + uuid_record[aggregation_field] = aggregation_values or None + if third_item["debug"].get("elasticsearch_hits"): + uuid_record["elasticsearch_counted"] = \ + uuid in third_item["debug"]["elasticsearch_hits"] + third_item["debug"]["portal_hits"].append(uuid_record) + uuid_records.append(uuid_record) + if third_item.get("debug", {}).get("portal_hits"): + third_item["debug"]["portal_hits"].sort(key=lambda item: item.get("uuid")) + if ((third_item.get("count") == 0) and + (third_item.get("debug_placeholder") is True) and + (not third_item.get("debug", {}).get("elasticsearch_hits")) and + (not third_item.get("debug", {}).get("portal_hits"))): # noqa + 
third_items_to_delete.append(third_item)
+                if third_items_to_delete:
+                    for third_item in third_items_to_delete:
+                        second_item_items.remove(third_item)
+
+        for uuid_record in uuid_records:
+            if (count := count_uuid(uuid_records, uuid_record["uuid"])) > 1:
+                uuid_record["duplicative"] = count
+
+    try:
+        annotate_with_uuids(normalized_results)
+    except Exception:
+        pass
+
+
+def get_normalized_aggregation_results_as_html_for_troublehshooting(normalized_results: dict,
+                                                                     uuids: bool = True,
+                                                                     uuid_details: bool = True,
+                                                                     query: bool = False,
+                                                                     verbose: bool = False,
+                                                                     debug: bool = False):
+    with _capture_output_to_html(debug=debug) as captured_output:
+        print_normalized_aggregation_results_for_troubleshooting(normalized_results,
+                                                                 uuids=uuids,
+                                                                 uuid_details=uuid_details,
+                                                                 query=query,
+                                                                 verbose=verbose)
+        return captured_output.text
+
+
+def print_normalized_aggregation_results_for_troubleshooting(normalized_results: dict,
+                                                             title: Optional[str] = None,
+                                                             parent_grouping_name: Optional[str] = None,
+                                                             parent_grouping_value: Optional[str] = None,
+                                                             uuids: bool = False,
+                                                             uuid_details: bool = False,
+                                                             nobold: bool = False,
+                                                             checks: bool = False,
+                                                             query: bool = False,
+                                                             verbose: bool = False) -> None:
+
+    """
+    For development/troubleshooting only ...
+    """
+    def get_aggregation_fields(normalized_results: dict) -> List[str]:
+        # Returns all noted/important aggregation fields which ARE actually being used by the query;
+        # we are only interested in the ones that are in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR,
+        # which is all of the possible sample-source/cell-line/donor aggregations.
+        if not isinstance(aggregation_fields :=
+                          normalized_results.get("debug", {}).get("aggregation_query_fields"), list):
+            aggregation_fields = []
+        else:
+            aggregation_fields = deepcopy(aggregation_fields)
+        for aggregation_field in aggregation_fields:
+            # Remove the ones we are not interested in reporting on.
+            if aggregation_field not in AGGREGATION_FIELD_GROUPING_CELL_OR_DONOR:
+                aggregation_fields.remove(aggregation_field)
+        return aggregation_fields
+
+    def get_aggregation_fields_to_print(normalized_results: dict) -> List[str]:
+        aggregation_fields_to_print = get_aggregation_fields(normalized_results)
+        if isinstance(aggregation_fields_for_troubleshooting :=
+                      normalized_results.get("debug", {}).get("aggregation_fields_for_troubleshooting"), list):
+            for aggregation_field_for_troubleshooting in aggregation_fields_for_troubleshooting:
+                if aggregation_field_for_troubleshooting not in aggregation_fields_to_print:
+                    aggregation_fields_to_print.append(aggregation_field_for_troubleshooting)
+        aggregation_fields_to_not_print = [
+            AGGREGATION_FIELD_RELEASE_DATE,
+            AGGREGATION_FIELD_FILE_DESCRIPTOR
+        ]
+        for aggregation_field_to_not_print in aggregation_fields_to_not_print:
+            if aggregation_field_to_not_print in aggregation_fields_to_print:
+                aggregation_fields_to_print.remove(aggregation_field_to_not_print)
+        return aggregation_fields_to_print
+
+    def get_aggregation_field_labels() -> dict:
+        # Shorter/nicer names for aggregation fields of interest to print.
+ return { + AGGREGATION_FIELD_CELL_MIXTURE: "sample-sources", + AGGREGATION_FIELD_CELL_LINE: "cell-lines", + AGGREGATION_FIELD_DONOR: "donors", + "file_sets.libraries.analytes.samples.sample_sources.display_title": "sample-sources-title" + } + + def print_results(data: dict, + parent_grouping_name: Optional[str] = None, + parent_grouping_value: Optional[str] = None, + indent: int = 0) -> None: + + nonlocal title, uuids, uuid_details, nobold, query, verbose + nonlocal chars_check, chars_dot, chars_rarrow_hollow, chars_xmark, red, green, green_bold, gray, bold + nonlocal aggregation_fields_to_print + + def get_portal_hits(data: dict) -> List[dict]: + hits = [] + if isinstance(data, dict) and isinstance(portal_hits := data.get("debug", {}).get("portal_hits"), list): + for portal_hit in portal_hits: + if isinstance(portal_hit, dict) and isinstance(uuid := portal_hit.get("uuid"), str) and uuid: + hits.append(portal_hit) + return hits + + def count_unique_portal_hits_recursively(data: dict) -> int: + def get_portal_hits_recursively(data: dict) -> List[dict]: # noqa + hits = [] + if isinstance(data, dict): + for key in data: + if key == "portal_hits": + if isinstance(data[key], list): + hits.extend(data[key]) + else: + hits.extend(get_portal_hits_recursively(data[key])) + elif isinstance(data, list): + for element in data: + hits.extend(get_portal_hits_recursively(element)) + return hits + hits = get_portal_hits_recursively(data) + hits = [hit.get("uuid") for hit in hits] + return len(set(hits)) + + def format_hit_property_values(hit: dict, property_name: str, + color: Optional[Callable] = None) -> Tuple[Optional[str], List[Tuple[str, str]]]: + nonlocal parent_grouping_name, parent_grouping_value, green, green_bold, chars_larrow_hollow + counted_elsewhere = [] + if hit.get("elasticsearch_counted", False) is False: + counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) + else: + counted_grouping_name, counted_grouping_value = (None, None) + if property_value := hit.get(property_name): + if property_name == parent_grouping_name: + property_values = [] + for property_value in property_value.split(","): + if (property_value := property_value.strip()) == parent_grouping_value: + property_value = color(property_value) if callable(color) else green_bold(property_value) + property_values.append(property_value) + else: + if (counted_grouping_name, counted_grouping_value) == (property_name, property_value): + property_values.append(green_bold(f"{property_value} {chars_larrow_hollow}") + + green(" COUNTED HERE")) + counted_elsewhere.append((counted_grouping_name, counted_grouping_value)) + else: + property_values.append(property_value) + property_value = ", ".join(property_values) + elif hit.get("elasticsearch_counted", False) is False: + counted_grouping_name, counted_grouping_value = find_where_aggregated_and_counted(hit.get("uuid")) + if (counted_grouping_name == property_name) and (counted_grouping_value == property_value): + property_value = green_bold(f"{property_value} {chars_larrow_hollow}") + green(" COUNTED HERE") + counted_elsewhere.append((counted_grouping_name, counted_grouping_value)) + return property_value, counted_elsewhere + + def find_where_aggregated_and_counted( + uuid: str, + multiple: bool = False, + ignore: Optional[Union[List[Tuple[str, str]], + Tuple[str, str]]] = None) -> Union[Tuple[str, str], List[Tuple[str, str]]]: + + nonlocal normalized_results + + def find_where(data: dict, uuid: str, + parent_grouping_name: Optional[str] = 
None, + parent_grouping_value: Optional[str] = None) -> List[Tuple[str, str]]: + found_uuid_grouping_names_and_values = set() + if isinstance(data, dict): + grouping_name = data.get("name") + grouping_value = data.get("value") + if isinstance(items := data.get("items"), list): + for item in items: + if found := find_where(item, uuid, + parent_grouping_name=grouping_name, + parent_grouping_value=grouping_value): + found_uuid_grouping_names_and_values.update(found) + elif isinstance(hits := data.get("debug", {}).get("portal_hits"), list): + for hit in hits: + if hit.get("uuid") == uuid: + if hit.get("elasticsearch_counted", False) is True: + found_uuid_grouping_names_and_values.add((parent_grouping_name, parent_grouping_value)) + return found_uuid_grouping_names_and_values + + if found_uuid_grouping_names_and_values := list(find_where(normalized_results, uuid)): + if isinstance(ignore, tuple) and (len(ignore) == 2) and (ignore in found_uuid_grouping_names_and_values): + found_uuid_grouping_names_and_values.remove(ignore) + elif isinstance(ignore, list): + for ignore_item in ignore: + if isinstance(ignore_item, tuple) and (len(ignore_item) == 2) and (ignore_item in found_uuid_grouping_names_and_values): + found_uuid_grouping_names_and_values.remove(ignore_item) + if multiple is True: + return found_uuid_grouping_names_and_values + if len(found_uuid_grouping_names_and_values) > 1: + # Normally should only be at most one item with elasticsearch_counted set to True. + pass + return found_uuid_grouping_names_and_values[0] + return [(None, None)] if multiple is True else (None, None) + + def print_hit_property_values(hit: dict, property_name: str, + label: Optional[str] = None, + prefix: Optional[str] = None, + color: Optional[Callable] = None) -> List[Tuple[str, str]]: + nonlocal aggregation_fields, aggregation_field_labels, chars_dot_hollow, chars_null, verbose + if not label: + label = aggregation_field_labels.get(property_name) + if (verbose is True) or (not label): + label = property_name + property_values, counted_elsewhere = format_hit_property_values(hit, property_name, color=color) + if not property_values: + property_values = chars_null + if property_name not in aggregation_fields: + property_description = f"{prefix or ''}{chars_dot_hollow} {label}: {property_values}" + property_description = gray(property_description) + else: + property_description = f"{prefix or ''}{chars_dot} {label}: {property_values}" + print(property_description) + return counted_elsewhere + + if not (isinstance(data, dict) and data): + return + if not (isinstance(indent, int) and (indent > 0)): + indent = 0 + spaces = (" " * indent) if indent > 0 else "" + grouping_name = data.get("name") + if isinstance(grouping_value := data.get("value"), str) and grouping_value: + grouping = bold(grouping_value) + if (verbose is True) and isinstance(grouping_name, str) and grouping_name: + grouping = f"{grouping_name} {chars_dot} {grouping}" + elif not (isinstance(grouping := title, str) and grouping): + grouping = "RESULTS" + grouping = f"{chars_diamond} {grouping}" + hits = get_portal_hits(data) if (uuids is True) else [] + note = "" + if isinstance(count := data.get("count"), int): + if (len(hits) > count) and (uuids is True): + note = red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {len(hits) - count}") + if count == 0: + note = red(f' {chars_rarrow_hollow} UNCOUNTED') + note + elif isinstance(items := data.get("items"), list): + subcount = 0 + for item in items: + if isinstance(subcount_item := item.get("count"), int): + 
subcount += subcount_item + if subcount != count: + note = red(f" {chars_xmark} ACTUAL COUNT: {subcount}") + elif checks is True: + note = f" {chars_check}" + elif checks: + note = f" {chars_check}" + if not ((count == 0) and (len(hits) == 0) and (not note)): + if (len(hits) == 0) and isinstance(items := data.get("items"), list): + # Count the actual hits for this non-terminal group. + if ((items_nhits := count_unique_portal_hits_recursively(items)) > count) and (uuids is True): + note += red(f" {chars_rarrow_hollow} MORE ACTUAL RESULTS: {items_nhits - count}") + if count == 0: + note = red(f' {chars_rarrow_hollow} UNCOUNTED') + note + print(f"{spaces}{grouping}: {count}{note}") + if (query is True) and (query_string := data.get("query")): + if _terminal_color == _html_color: + print(f"{spaces} {query_string}") + else: + print(f"{spaces} {query_string}") + for hit in hits: + if isinstance(hit, dict) and isinstance(uuid := hit.get("uuid"), str) and uuid: + if hit.get("elasticsearch_counted", False) is False: + print(red(f"{spaces} {chars_dot} {uuid} {chars_xmark} UNCOUNTED")) + color = red_bold + else: + print(f"{spaces} {chars_dot} {uuid} {chars_check}") + color = green_bold + if uuid_details is True: + prefix = f"{spaces} " + counted_elsewhere = [] + # Show property values for troubleshooting (as this whole thing is); + # see add_info_for_troubleshooting.annotate_with_uuids. + for aggregation_field in aggregation_fields_to_print: + hit_counted_elsewhere = \ + print_hit_property_values(hit, aggregation_field, prefix=prefix, color=color) + if hit_counted_elsewhere: + counted_elsewhere.extend(hit_counted_elsewhere) + # See if also grouped elsewhere for our FYI. + duplicative = hit.get("duplicative") + duplicates = duplicative - 1 if isinstance(duplicative, int) else 0 + counted_groupings = find_where_aggregated_and_counted( + hit.get("uuid"), multiple=True, + ignore=counted_elsewhere + [(parent_grouping_name, parent_grouping_value)]) + if counted_groupings: + message = f"{spaces} {green(chars_rarrow_hollow)} {green('ALSO COUNTED HERE')}:" + if verbose is True: + if duplicates > 0: + message += f" {duplicates}" + if duplicates != len(counted_groupings): + message += red_bold(f" {chars_xmark} vs {len(counted_groupings)}") + print(message) + for counted_grouping in counted_groupings: + print(f"{spaces} - {counted_grouping[0]} {green(counted_grouping[1])}") + else: + counted_grouping_values = [green(counted_grouping[1]) for counted_grouping in counted_groupings] + message = f"{message} {', '.join(counted_grouping_values)}" + if duplicates > 0: + if duplicates != len(counted_groupings): + message += red_bold(f" {chars_xmark} {duplicates} vs {len(counted_grouping_values)}") + print(message) + if isinstance(items := data.get("items"), list): + for element in items: + print_results(element, + parent_grouping_name=grouping_name, + parent_grouping_value=grouping_value, + indent=indent + 2) + + aggregation_fields = get_aggregation_fields(normalized_results) + aggregation_fields_to_print = get_aggregation_fields_to_print(normalized_results) + aggregation_field_labels = get_aggregation_field_labels() + + red = lambda text: _terminal_color(text, "red") # noqa + red_bold = lambda text: _terminal_color(text, "red", bold=True) # noqa + green = lambda text: _terminal_color(text, "green") # noqa + green_bold = lambda text: _terminal_color(text, "green", bold=True) # noqa + gray = lambda text: _terminal_color(text, "grey") # noqa + bold = (lambda text: _terminal_color(text, bold=True)) if (nobold is not True) 
else (lambda text: text) + chars_check = "✓" + chars_xmark = "✗" + chars_dot = "•" + chars_dot_hollow = "◦" + chars_diamond = "❖" + chars_rarrow_hollow = "▷" + chars_larrow_hollow = "◁" + chars_null = "∅" + + print_results(normalized_results) + + +def _get_properties(data: dict, name: str, fallback: Optional[Any] = None, sort: bool = False) -> List[Any]: + """ + TODO: Move this to dcicutils. Maybe much of the above too. + Returns the values of the given property name within the given dictionary as a list, where the + given property name can be a dot-separated list of property names, which indicate a path into + nested dictionaries within the given dictionary; and - where if any of the elements within + the path are lists then we iterate through each, collecting the values for each and including + each within the list of returned values. + """ + if isinstance(data, dict) and isinstance(name, str) and name: + if keys := name.split("."): + nkeys = len(keys) ; key_index_max = nkeys - 1 # noqa + for key_index in range(nkeys): + if (value := data.get(keys[key_index], None)) is not None: + if key_index == key_index_max: + return [value] if not isinstance(value, list) else value + elif isinstance(value, dict): + data = value + continue + elif isinstance(value, list) and value and ((sub_key_index := key_index + 1) < nkeys): + sub_key = ".".join(keys[sub_key_index:]) + values = [] + for element in value: + if isinstance(element_value := _get_properties(element, sub_key), list): + for element_value_item in element_value: + if (element_value_item is not None) and (element_value_item not in values): + values.append(element_value_item) + elif (element_value is not None) and (element_value not in values): + values.append(element_value) + return sorted(values) if (sort is True) else values + break + return fallback if isinstance(fallback, list) else ([] if fallback is None else [fallback]) + + +def colored_html(value: str, color: Optional[str] = None, attrs: Optional[list] = None) -> str: + if isinstance(value, str): + if isinstance(color, str) and color: + value = f"{value}" + if isinstance(attrs, list): + if "bold" in attrs: + value = f"{value}" + return value + + +def _terminal_color(value: str, + color: Optional[str] = None, + dark: bool = False, + bold: bool = False, + underline: bool = False, + nocolor: bool = False) -> str: + # This is used only for troubleshooting by + if nocolor is True: + return value + attributes = [] + if dark is True: + attributes.append("dark") + if bold is True: + attributes.append("bold") + if underline is True: + attributes.append("underline") + if isinstance(color, str) and color: + return colored(value, color.lower(), attrs=attributes) + return colored(value, attrs=attributes) + + +def _html_color(value: str, + color: Optional[str] = None, + dark: bool = False, + bold: bool = False, + underline: bool = False, + nocolor: bool = False) -> str: + if (nocolor is not True) and isinstance(value, str): + if isinstance(color, str) and color: + if dark is True: + value = f"{value}" + else: + value = f"{value}" + if bold is True: + value = f"{value}" + if underline is True: + value = f"{value}" + return value + + +@contextmanager +def _capture_output_to_html(debug: bool = False): + + captured_output = "" + class CapturedOutput: # noqa + @property # noqa + def text(self): + nonlocal captured_output + return captured_output + def captured_print(*args, **kwargs): # noqa + nonlocal captured_output + captured_output += str(args[0]) + "\n" + this_module = 
"encoded.endpoints.recent_files_summary.recent_files_summary_troubleshooting" + if debug is True: + with patch(f"{this_module}.print", captured_print): + yield CapturedOutput() + else: + with (patch(f"{this_module}.print", captured_print), patch(f"{this_module}._terminal_color", _html_color)): + yield CapturedOutput() diff --git a/src/encoded/metadata.py b/src/encoded/metadata.py index 6a4badf79..4eba39721 100644 --- a/src/encoded/metadata.py +++ b/src/encoded/metadata.py @@ -287,6 +287,11 @@ def peek_metadata(context, request): """ Helper for the UI that will retrieve faceting information about data retrieved from /metadata """ # get arguments from helper args = handle_metadata_arguments(context, request) + if isinstance(args, Response): + # dmichaels/2024-12-16: Hackish fix for now; handle_metadata_arguments not returning MetadataArgs for ... + subreq = make_search_subreq(request, '{}?{}'.format('/search', urlencode(request.params, True)), inherit_user=True) + result = search(context, subreq) + return result['facets'] # Generate search search_param = {} diff --git a/src/encoded/static/components/browse/BrowseView.js b/src/encoded/static/components/browse/BrowseView.js new file mode 100644 index 000000000..08ce8ae84 --- /dev/null +++ b/src/encoded/static/components/browse/BrowseView.js @@ -0,0 +1,242 @@ +'use strict'; + +import React from 'react'; +import memoize from 'memoize-one'; +import _ from 'underscore'; +import url from 'url'; + +import { + memoizedUrlParse, + schemaTransforms, + analytics, +} from '@hms-dbmi-bgm/shared-portal-components/es/components/util'; +import { SearchView as CommonSearchView } from '@hms-dbmi-bgm/shared-portal-components/es/components/browse/SearchView'; +import { DetailPaneStateCache } from '@hms-dbmi-bgm/shared-portal-components/es/components/browse/components/DetailPaneStateCache'; +import { columnExtensionMap } from './columnExtensionMap'; +import { Schemas } from './../util'; +import { + TitleAndSubtitleBeside, + PageTitleContainer, + TitleAndSubtitleUnder, + pageTitleViews, + EditingItemPageTitle, +} from './../PageTitleSection'; + +export default function BrowseView(props) { + const { + context: { '@type': searchPageType = ['ItemSearchResults'] }, + } = props; + const isCaseSearch = searchPageType[0] === 'CaseSearchResults'; + + if (isCaseSearch) { + return ( + + + + ); + } + + return ; +} + +export class BrowseViewBody extends React.PureComponent { + /** + * Function which is passed into a `.filter()` call to + * filter context.facets down, usually in response to frontend-state. + * + * Currently is meant to filter out type facet if we're in selection mode, + * as well as some fields from embedded 'experiment_set' which might + * give unexpected results. + * + * @todo Potentially get rid of this and do on backend. + * + * @param {{ field: string }} facet - Object representing a facet. + * @returns {boolean} Whether to keep or discard facet. + */ + static filterFacet(facet, currentAction) { + // Set in backend or schema for facets which are under development or similar. + if (facet.hide_from_view) return false; + + // Remove the @type facet while in selection mode. + if (facet.field === 'type' && currentAction === 'selection') + return false; + + return true; + } + + /** Filter the `@type` facet options down to abstract types only (if none selected) for Search. */ + static transformedFacets(context, currentAction, schemas) { + // Clone/filter list of facets. 
+ // We may filter out type facet completely at this step, + // in which case we can return out of func early. + const facets = context.facets.filter(function (facet) { + return BrowseViewBody.filterFacet(facet, currentAction); + }); + + // Find facet for '@type' + const searchItemTypes = + schemaTransforms.getAllSchemaTypesFromSearchContext(context); // "Item" is excluded + + if (searchItemTypes.length > 0) { + console.info( + "A (non-'Item') type filter is present. Will skip filtering Item types in Facet." + ); + // Keep all terms/leaf-types - backend should already filter down to only valid sub-types through + // nature of search itself. + + if (searchItemTypes.length > 1) { + const errMsg = + 'More than one "type" filter is selected. This is intended to not occur, at least as a consequence of interacting with the UI. Perhaps have entered multiple types into URL.'; + analytics.exception('CGAP SearchView - ' + errMsg); + console.warn(errMsg); + } + + return facets; + } + + const typeFacetIndex = _.findIndex(facets, { field: 'type' }); + if (typeFacetIndex === -1) { + console.error( + 'Could not get type facet, though some filter for it is present.' + ); + return facets; // Facet not present, return. + } + + // Avoid modifying in place. + facets[typeFacetIndex] = _.clone(facets[typeFacetIndex]); + + // Show only base types for when itemTypesInSearch.length === 0 (aka 'type=Item'). + facets[typeFacetIndex].terms = _.filter( + facets[typeFacetIndex].terms, + function (itemType) { + const parentType = schemaTransforms.getAbstractTypeForType( + itemType.key, + schemas + ); + return !parentType || parentType === itemType.key; + } + ); + + return facets; + } + + /** Not currently used. */ + static filteredFilters(filters) { + const typeFilterCount = filters.reduce(function (m, { field }) { + if (field === 'type') return m + 1; + return m; + }, 0); + return filters.filter(function ({ field, term }) { + if (field === 'type') { + if (term === 'Item') { + return false; + } + if (typeFilterCount === 1) { + return false; + } + } + return true; + }); + } + + constructor(props) { + super(props); + this.memoized = { + transformedFacets: memoize(BrowseViewBody.transformedFacets), + filteredFilters: memoize(BrowseViewBody.filteredFilters), + }; + } + + render() { + const { + isCaseSearch = false, + context, + currentAction, + schemas, + } = this.props; + + // We don't need full screen btn on CGAP as already full width. + const passProps = _.omit( + this.props, + 'isFullscreen', + 'toggleFullScreen', + 'isCaseSearch' + ); + + //const filters = BrowseView.filteredFilters(context.filters || []); + const facets = this.memoized.transformedFacets( + context, + currentAction, + schemas + ); + const tableColumnClassName = 'results-column col'; + const facetColumnClassName = 'facets-column col-auto'; + + return ( +
+ + HELLO: THIS IS BROWSE-VIEW! +
+ ); + } +} + +const BrowseViewPageTitle = React.memo(function BrowseViewPageTitle(props) { + const { context, schemas, currentAction, alerts } = props; + + if (currentAction === 'add') { + // Fallback unless any custom PageTitles registered for @type=SearchResults & currentAction=add + return ( + + ); + } + + if (currentAction === 'selection' || currentAction === 'multiselect') { + return ( + + + Selecting + + + ); + } + + const thisTypeTitle = schemaTransforms.getSchemaTypeFromSearchContext( + context, + schemas + ); + const subtitle = thisTypeTitle ? ( + + for {thisTypeTitle} + + ) : null; + + return ( + + + Search + + + ); +}); + +pageTitleViews.register(BrowseViewPageTitle, 'Browse'); +pageTitleViews.register(BrowseViewPageTitle, 'Browse', 'selection'); +pageTitleViews.register(BrowseViewPageTitle, 'Browse', 'add'); diff --git a/src/encoded/static/components/index.js b/src/encoded/static/components/index.js index 1227a21dd..e0d041ac9 100644 --- a/src/encoded/static/components/index.js +++ b/src/encoded/static/components/index.js @@ -27,6 +27,7 @@ import DocumentView from './item-pages/DocumentView'; import StaticSectionView from './item-pages/StaticSectionView'; import SMaHTSubmissionView from './forms/SMaHTSubmissionView'; import SearchView from './browse/SearchView'; +import BrowseView from './browse/BrowseView'; import FileView from './item-pages/FileView'; /** @@ -59,6 +60,10 @@ content_views.register(SearchView, 'Search'); content_views.register(SearchView, 'Search', 'selection'); content_views.register(SearchView, 'Search', 'multiselect'); +content_views.register(BrowseView, 'Browse'); +content_views.register(BrowseView, 'Browse', 'selection'); +content_views.register(BrowseView, 'Browse', 'multiselect'); + // Fallback for anything we haven't registered content_views.fallback = function () { return FallbackView; diff --git a/src/encoded/tests/test_elasticsearch_utils.py b/src/encoded/tests/test_elasticsearch_utils.py new file mode 100644 index 000000000..979d13272 --- /dev/null +++ b/src/encoded/tests/test_elasticsearch_utils.py @@ -0,0 +1,283 @@ +import pytest +from typing import Optional +from encoded.endpoints.elasticsearch_utils import ( + create_elasticsearch_aggregation_query, + merge_elasticsearch_aggregation_results, + normalize_elasticsearch_aggregation_results) +from encoded.endpoints.recent_files_summary.recent_files_summary import ( + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR) + +def test_create_elasticsearch_aggregation_query_a(): + + def create_field_aggregation(field: str) -> Optional[dict]: + if field == AGGREGATION_FIELD_RELEASE_DATE: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", "format": "yyyy-MM", + "missing": "1970-01", "order": {"_key": "desc"} + } + } + + aggregations = [ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, create_field_aggregation=create_field_aggregation) + + assert aggregation_query == { + "file_status_tracking.released": { + "meta": {"field_name": "file_status_tracking.released"}, + "filter": { + "bool": { + "must": [ + {"exists": {"field": "embedded.file_status_tracking.released.raw"}}, + {"exists": {"field": "embedded.file_sets.libraries.analytes.samples.sample_sources.cell_line.code.raw"}}, + {"exists": {"field": "embedded.release_tracker_description.raw"}} + ] + } + }, + "aggs": { + 
"dummy_date_histogram": { + "date_histogram": { + "field": "embedded.file_status_tracking.released", + "calendar_interval": "month", "format": "yyyy-MM", + "missing": "1970-01", "order": { "_key": "desc" } + }, + "aggs": { + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "terms": { + "field": "embedded.file_sets.libraries.analytes.samples.sample_sources.cell_line.code.raw", + "missing": "No value", "size": 100 + }, + "aggs": { + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "terms": { + "field": "embedded.release_tracker_description.raw", + "missing": "No value", "size": 100 + } + } + } + } + } + } + } + } + } + + +def test_create_elasticsearch_aggregation_query_b(): + + def create_field_aggregation(field: str) -> Optional[dict]: + if field == AGGREGATION_FIELD_RELEASE_DATE: + return { + "date_histogram": { + "field": f"embedded.{field}", + "calendar_interval": "month", "format": "yyyy-MM", + "missing": "1970-01", "order": {"_key": "desc"} + } + } + + aggregations = [ + AGGREGATION_FIELD_RELEASE_DATE, + AGGREGATION_FIELD_CELL_LINE, + AGGREGATION_FIELD_FILE_DESCRIPTOR + ] + + # Same as previous tests but with include_missing=True (no date_histogram complication). + aggregation_query = create_elasticsearch_aggregation_query( + aggregations, create_field_aggregation=create_field_aggregation, include_missing=True) + + assert aggregation_query == { + "file_status_tracking.released": { + "meta": {"field_name": "file_status_tracking.released"}, + "date_histogram": { + "field": "embedded.file_status_tracking.released", + "calendar_interval": "month", "format": "yyyy-MM", + "missing": "1970-01", "order": {"_key": "desc"} + }, + "aggs": { + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "terms": { + "field": "embedded.file_sets.libraries.analytes.samples.sample_sources.cell_line.code.raw", + "missing": "No value", "size": 100 + }, + "aggs": { + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "terms": { + "field": "embedded.release_tracker_description.raw", + "missing": "No value", "size": 100 + } + } + } + } + } + } + } + + +def test_merge_elasticsearch_aggregation_results_a(): + + target = { + "meta": {"field_name": "date_created"}, "doc_count": 7, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 7, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "WGS ONT PromethION 24 bam", "doc_count": 7} + ] + } + } + ] + } + } + ] + } + + source = { + "meta": {"field_name": "date_created"}, "doc_count": 12, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 12, + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 12} + ] + } + } + ] + } + } + ] + } + + assert 
merge_elasticsearch_aggregation_results(target, source) == { + "meta": {"field_name": "date_created"}, "doc_count": 19, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 19, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "WGS ONT PromethION 24 bam", "doc_count": 7} + ] + } + } + ] + }, + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 12} + ] + } + } + ] + } + } + ] + } + + +def test_normalize_elasticsearch_aggregation_results_a(): + + results = { + "meta": {"field_name": "date_created"}, "doc_count": 15, + "buckets": [ + { + "key_as_string": "2024-12", "key": 1733011200000, "doc_count": 25, + "file_sets.libraries.analytes.samples.sample_sources.cell_line.code": { + "meta": {"field_name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code"}, + "buckets": [ + { + "key": "COLO829T", "doc_count": 7, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "WGS ONT PromethION 24 bam", "doc_count": 1} + ] + } + } + ] + }, + "donors.display_title": { + "meta": {"field_name": "donors.display_title"}, + "buckets": [ + { + "key": "DAC_DONOR_COLO829", "doc_count": 12, + "release_tracker_description": { + "meta": {"field_name": "release_tracker_description"}, + "buckets": [ + {"key": "Fiber-seq PacBio Revio bam", "doc_count": 4} + ] + } + } + ] + } + } + ] + } + + assert normalize_elasticsearch_aggregation_results(results) == { + "count": 25, + "items": [ + { + "name": "date_created", + "value": "2024-12", "count": 11, + "items": [ + { + "name": "file_sets.libraries.analytes.samples.sample_sources.cell_line.code", + "value": "COLO829T", "count": 1, + "items": [ + { + "name": "release_tracker_description", + "value": "WGS ONT PromethION 24 bam", "count": 1 + } + ] + }, + { + "name": "donors.display_title", + "value": "DAC_DONOR_COLO829", "count": 4, + "items": [ + { + "name": "release_tracker_description", + "value": "Fiber-seq PacBio Revio bam", "count": 4 + } + ] + } + ] + } + ] + } diff --git a/src/encoded/tests/test_endpoint_utils.py b/src/encoded/tests/test_endpoint_utils.py new file mode 100644 index 000000000..8b30634d1 --- /dev/null +++ b/src/encoded/tests/test_endpoint_utils.py @@ -0,0 +1,79 @@ +from contextlib import contextmanager +import datetime +from typing import Optional, Union +from unittest.mock import patch as mock_patch +from encoded.endpoints.endpoint_utils import parse_date_range_related_arguments, parse_datetime_string + +DEFAULT_MOCK_DATETIME_TODAY_VALUE = "2024-11-06 07:54:16" + + +def test_parse_date_range_related_arguments_sans_from_thru_dates(): + + def testf(nmonths, include_current_month): + # Note that include_current_month used ONLY if NEITHER from_date NOR thru_date are specified (this case). 
+ return parse_date_range_related_arguments(None, None, nmonths=nmonths, + include_current_month=include_current_month, strings=True) + + with mocked_datetime_today(DEFAULT_MOCK_DATETIME_TODAY_VALUE): + assert testf(nmonths=3, include_current_month=False) == ("2024-08-01", "2024-10-31") + assert testf(nmonths=3, include_current_month=False) == ("2024-08-01", "2024-10-31") + assert testf(nmonths=-3, include_current_month=True) == ("2024-08-01", "2024-11-30") + assert testf(nmonths=-3, include_current_month=False) == ("2024-08-01", "2024-10-31") + assert testf(nmonths=1, include_current_month=False) == ("2024-10-01", "2024-10-31") + assert testf(nmonths=1, include_current_month=True) == ("2024-10-01", "2024-11-30") + assert testf(nmonths=0, include_current_month=False) == (None, None) + assert testf(nmonths=0, include_current_month=True) == ("2024-11-01", "2024-11-30") + + +def test_parse_date_range_related_arguments_with_from_thru_dates(): + + def testf(from_date, thru_date): + # Note that include_current_month used ONLY if NEITHER from_date NOR thru_date are specified. + return parse_date_range_related_arguments(from_date, thru_date, nmonths=None, + include_current_month=None, strings=True) + + with mocked_datetime_today(DEFAULT_MOCK_DATETIME_TODAY_VALUE): + assert testf("2024-05-16", "2024-08-29") == ("2024-05-16", "2024-08-29") + assert testf("2024-08-29", "2024-05-16") == ("2024-05-16", "2024-08-29") + assert testf("2024-11-04", "2035-10-06") == ("2024-11-04", "2035-10-06") + + +def test_parse_date_range_related_arguments_with_from_date(): + + def testf(from_date, nmonths): + # Note that include_current_month used ONLY if NEITHER from_date NOR thru_date are specified. + return parse_date_range_related_arguments(from_date, None, nmonths=nmonths, + include_current_month=None, strings=True) + + with mocked_datetime_today(DEFAULT_MOCK_DATETIME_TODAY_VALUE): + assert testf("2024-06-24", nmonths=None) == ("2024-06-24", None) + assert testf("2024-06-24", nmonths=0) == ("2024-06-24", "2024-06-30") + assert testf("2024-06-24", nmonths=1) == ("2024-06-24", "2024-07-24") + + +def test_parse_date_range_related_arguments_with_thru_date(): + + def testf(thru_date, nmonths): + # Note that include_current_month used ONLY if NEITHER from_date NOR thru_date are specified. + return parse_date_range_related_arguments(None, thru_date, nmonths=nmonths, + include_current_month=None, strings=True) + + with mocked_datetime_today(DEFAULT_MOCK_DATETIME_TODAY_VALUE): + assert testf("2024-06-24", nmonths=None) == (None, "2024-06-24") + assert testf("2024-06-24", nmonths=0) == ("2024-06-01", "2024-06-24") + assert testf("2024-06-24", nmonths=-1) == ("2024-05-24", "2024-06-24") + + +@contextmanager +def mocked_datetime_today(value: Optional[Union[str, datetime.datetime]] = DEFAULT_MOCK_DATETIME_TODAY_VALUE): + if isinstance(value, str): + value = parse_datetime_string(value) + if not isinstance(value, datetime.datetime): + raise Exception("Error using mocked_datetime_today function!") + class MockDateTime(datetime.datetime): # noqa + @classmethod + def today(cls): + nonlocal value ; return value # noqa + with (mock_patch("encoded.endpoints.endpoint_utils.datetime", MockDateTime), + mock_patch("datetime.datetime", MockDateTime)): + yield
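Finally, for anyone reviewing this PR who wants to exercise the new endpoint by hand, here is a minimal sketch of a request and of the normalized response shape. The host/port, query parameters, and all values shown are placeholders or assumptions; the response structure simply mirrors the name/value/count/items nesting asserted in test_normalize_elasticsearch_aggregation_results_a above.

    import requests  # assumes a locally running portal; URL, params, and auth are placeholders

    response = requests.get(
        "http://localhost:6543/recent_files_summary?nmonths=3&include_queries=true",
        headers={"Accept": "application/json"})
    summary = response.json()

    # Illustrative response shape (values are made up), following the normalization tests above:
    # {
    #     "count": 25,
    #     "items": [
    #         {"name": "file_status_tracking.released", "value": "2024-12", "count": 11,
    #          "items": [
    #              {"name": "donors.display_title", "value": "DAC_DONOR_COLO829", "count": 4,
    #               "items": [{"name": "release_tracker_description",
    #                          "value": "Fiber-seq PacBio Revio bam", "count": 4}]}]}]
    # }
    for month_group in summary.get("items", []):
        print(month_group["value"], month_group["count"])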