From c14529b65fde3c0bf4492ddbb1a09d2db4781a33 Mon Sep 17 00:00:00 2001 From: Kartik Ganesh Date: Thu, 14 Dec 2023 15:29:28 -0800 Subject: [PATCH] Add track_total_hits parameter to doc-count operation (#474) The track_total_hits parameter forces an accurate doc-count even when the hit count for _search is above the default 10K limit. This resolves a bug where the total doc count was not being computed properly. Unit tests have also been updated to match the new path. Signed-off-by: Kartik Ganesh --- FetchMigration/python/index_operations.py | 6 ++++-- FetchMigration/python/tests/test_index_operations.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/FetchMigration/python/index_operations.py b/FetchMigration/python/index_operations.py index 9c3ed54ea..03f1f11d1 100644 --- a/FetchMigration/python/index_operations.py +++ b/FetchMigration/python/index_operations.py @@ -22,8 +22,10 @@ COUNT_KEY = "count" __INDEX_KEY = "index" __ALL_INDICES_ENDPOINT = "*" -__SEARCH_COUNT_PATH = "/_search?size=0" -__SEARCH_COUNT_PAYLOAD = {"aggs": {"count": {"terms": {"field": "_index"}}}} +# (ES 7+) size=0 avoids the "hits" payload to reduce the response size since we're only interested in the aggregation, +# and track_total_hits forces an accurate doc-count +__SEARCH_COUNT_PATH = "/_search" +__SEARCH_COUNT_PAYLOAD = {"size": 0, "track_total_hits": True, "aggs": {"count": {"terms": {"field": "_index"}}}} __TOTAL_COUNT_JSONPATH = jsonpath_ng.parse("$.hits.total.value") __INDEX_COUNT_JSONPATH = jsonpath_ng.parse("$.aggregations.count.buckets") __BUCKET_INDEX_NAME_KEY = "key" diff --git a/FetchMigration/python/tests/test_index_operations.py b/FetchMigration/python/tests/test_index_operations.py index 8a13836ed..06f9502cb 100644 --- a/FetchMigration/python/tests/test_index_operations.py +++ b/FetchMigration/python/tests/test_index_operations.py @@ -99,7 +99,7 @@ def test_doc_count(self): for index_name in test_indices: test_buckets.append({"key": index_name, "doc_count": index_doc_count}) total_docs: int = index_doc_count * len(test_buckets) - expected_count_endpoint = test_constants.SOURCE_ENDPOINT + ",".join(test_indices) + "/_search?size=0" + expected_count_endpoint = test_constants.SOURCE_ENDPOINT + ",".join(test_indices) + "/_search" mock_count_response = {"hits": {"total": {"value": total_docs}}, "aggregations": {"count": {"buckets": test_buckets}}} responses.get(expected_count_endpoint, json=mock_count_response) @@ -110,7 +110,7 @@ def test_doc_count(self): @responses.activate def test_doc_count_error(self): test_indices = {test_constants.INDEX1_NAME, test_constants.INDEX2_NAME} - expected_count_endpoint = test_constants.SOURCE_ENDPOINT + ",".join(test_indices) + "/_search?size=0" + expected_count_endpoint = test_constants.SOURCE_ENDPOINT + ",".join(test_indices) + "/_search" responses.get(expected_count_endpoint, body=requests.Timeout()) self.assertRaises(RuntimeError, index_operations.doc_count, test_indices, EndpointInfo(test_constants.SOURCE_ENDPOINT))