From 2ac6f63ba2e5f95b2997a6b195ce420bc43a72a3 Mon Sep 17 00:00:00 2001 From: Quan Zhou Date: Thu, 10 Oct 2024 20:04:46 +0200 Subject: [PATCH] [enriched/githubql] Fix enrich_reference_analysis study This code fixes the `enrich_reference_analysis` study that only processed the first 10 references instead of all of them. By default, the ElasticSearch/OpenSearch `aggregations` query only returns the first 10 documents. Using `composite aggregations` we can paginate the result to get all the references. Signed-off-by: Quan Zhou --- grimoire_elk/enriched/githubql.py | 137 ++++++++++-------- ..._analysis-study-process-all-references.yml | 11 ++ 2 files changed, 90 insertions(+), 58 deletions(-) create mode 100644 releases/unreleased/the-enrich_reference_analysis-study-process-all-references.yml diff --git a/grimoire_elk/enriched/githubql.py b/grimoire_elk/enriched/githubql.py index c6545c492..03b3fb675 100644 --- a/grimoire_elk/enriched/githubql.py +++ b/grimoire_elk/enriched/githubql.py @@ -593,6 +593,84 @@ def _get_merged_prs(es_input): return merged_prs_list + def _get_cross_references(es_input, index): + # Get all CrossReferencedEvent items and their referenced issues and pull requests + es_query = { + "size": 0, + "track_total_hits": True, + "query": { + "bool": { + "must": { + "term": { + "event_type": "CrossReferencedEvent" + } + } + } + }, + "aggs": { + "composite_issue_url": { + "composite": { + "sources": [{ + "issue_url": { + "terms": { + "field": "issue_url" + } + } + }], + "size": 1000 + }, + "aggs": { + "references_urls": { + "terms": { + "field": "reference_source_url", + "size": 10000 + } + } + } + } + } + } + + buckets = [] + while True: + cross_references = es_input.search(index=index, body=es_query) + buckets += cross_references['aggregations']['composite_issue_url']['buckets'] + after_key = cross_references['aggregations']['composite_issue_url'].get('after_key', None) + if not after_key: + break + 
es_query['aggs']['composite_issue_url']['composite']['after'] = after_key + + reference_dict = {} + for item in buckets: + issue_url = item['key']['issue_url'] + references = [ref['key'] for ref in item['references_urls']['buckets']] + + # Update reference dictionary + if issue_url not in reference_dict.keys(): + reference_dict[issue_url] = references + else: + prev_references = reference_dict[issue_url] + prev_references.append(references) + reference_dict[issue_url] = list(set(prev_references)) + + # Adding list entries from reversed references + for issue_url in reference_dict.keys(): + reference_list = reference_dict[issue_url] + if not reference_list: + continue + for ref in reference_list: + try: + ref_entry_list = reference_dict[ref] + except KeyError: + continue + if ref_entry_list: + ref_entry_list.append(issue_url) + else: + ref_entry_list = [issue_url] + reference_dict[ref] = list(set(ref_entry_list)) + + return reference_dict + data_source = enrich_backend.__class__.__name__.split("Enrich")[0].lower() log_prefix = "[{}] Cross reference analysis".format(data_source) logger.info("{} starting study {}".format(log_prefix, anonymize_url(self.elastic.index_url))) @@ -605,64 +683,7 @@ def _get_merged_prs(es_input): logger.info("{} Retrieving the merged PRs from MergeEvents".format(log_prefix)) merged_prs = _get_merged_prs(es_in) - # Get all CrossReferencedEvent items and their referenced issues and pull requests - es_query = { - "size": 0, - "query": { - "bool": { - "must": { - "term": { - "event_type": "CrossReferencedEvent" - } - } - } - }, - "aggs": { - "issue_url": { - "terms": { - "field": "issue_url", - "size": 30000 - }, - "aggs": { - "uniq_gender": { - "terms": {"field": "reference_source_url"} - } - } - } - } - } - - cross_references = es_in.search(index=in_index, body=es_query) - buckets = cross_references['aggregations']['issue_url']['buckets'] - - reference_dict = {} - for item in buckets: - issue_url = item['key'] - references = [ref['key'] for 
ref in item['uniq_gender']['buckets']] - - # Update reference dictionary - if issue_url not in reference_dict.keys(): - reference_dict[issue_url] = references - else: - prev_references = reference_dict[issue_url] - prev_references.append(references) - reference_dict[issue_url] = list(set(prev_references)) - - # Adding list entries from reversed references - for issue_url in reference_dict.keys(): - reference_list = reference_dict[issue_url] - if not reference_list: - continue - for ref in reference_list: - try: - ref_entry_list = reference_dict[ref] - except KeyError: - continue - if ref_entry_list: - ref_entry_list.append(issue_url) - else: - ref_entry_list = [issue_url] - reference_dict[ref] = list(set(ref_entry_list)) + reference_dict = _get_cross_references(es_in, in_index) # Updated affected issues and pull requests painless_code = """ diff --git a/releases/unreleased/the-enrich_reference_analysis-study-process-all-references.yml b/releases/unreleased/the-enrich_reference_analysis-study-process-all-references.yml new file mode 100644 index 000000000..972ecaf77 --- /dev/null +++ b/releases/unreleased/the-enrich_reference_analysis-study-process-all-references.yml @@ -0,0 +1,11 @@ +--- +title: The enrich_reference_analysis study processes all references +category: fixed +author: Quan Zhou +issue: null +notes: > + The `enrich_reference_analysis` study analyzes the cross-references + between "issues" and "pull requests". When we use an aggregations query, + it returns only the first 10 items (ElasticSearch/OpenSearch by default). + By using 'composite aggregations', we can paginate the result and thus + obtain all the references.