From c14529b65fde3c0bf4492ddbb1a09d2db4781a33 Mon Sep 17 00:00:00 2001
From: Kartik Ganesh <gkart@amazon.com>
Date: Thu, 14 Dec 2023 15:29:28 -0800
Subject: [PATCH] Add track_total_hits parameter to doc-count operation (#474)

The track_total_hits parameter forces an accurate doc-count even when the number of hits for a _search request exceeds the default 10K limit. This resolves a bug where the total doc count was not computed correctly once that limit was exceeded. The size=0 setting has moved from the query string into the request body, so the unit tests have been updated to expect the new "/_search" path.

Signed-off-by: Kartik Ganesh <gkart@amazon.com>
---
 FetchMigration/python/index_operations.py            | 6 ++++--
 FetchMigration/python/tests/test_index_operations.py | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/FetchMigration/python/index_operations.py b/FetchMigration/python/index_operations.py
index 9c3ed54ea..03f1f11d1 100644
--- a/FetchMigration/python/index_operations.py
+++ b/FetchMigration/python/index_operations.py
@@ -22,8 +22,10 @@
 COUNT_KEY = "count"
 __INDEX_KEY = "index"
 __ALL_INDICES_ENDPOINT = "*"
-__SEARCH_COUNT_PATH = "/_search?size=0"
-__SEARCH_COUNT_PAYLOAD = {"aggs": {"count": {"terms": {"field": "_index"}}}}
+# (ES 7+) size=0 omits the "hits" array to reduce the response size, since only the total and the aggregation are read,
+# and track_total_hits forces an accurate total doc-count even above the default 10K hit-tracking limit
+__SEARCH_COUNT_PATH = "/_search"
+__SEARCH_COUNT_PAYLOAD = {"size": 0, "track_total_hits": True, "aggs": {"count": {"terms": {"field": "_index"}}}}
 __TOTAL_COUNT_JSONPATH = jsonpath_ng.parse("$.hits.total.value")
 __INDEX_COUNT_JSONPATH = jsonpath_ng.parse("$.aggregations.count.buckets")
 __BUCKET_INDEX_NAME_KEY = "key"
diff --git a/FetchMigration/python/tests/test_index_operations.py b/FetchMigration/python/tests/test_index_operations.py
index 8a13836ed..06f9502cb 100644
--- a/FetchMigration/python/tests/test_index_operations.py
+++ b/FetchMigration/python/tests/test_index_operations.py
@@ -99,7 +99,7 @@ def test_doc_count(self):
         for index_name in test_indices:
             test_buckets.append({"key": index_name, "doc_count": index_doc_count})
         total_docs: int = index_doc_count * len(test_buckets)
-        expected_count_endpoint = test_constants.SOURCE_ENDPOINT + ",".join(test_indices) + "/_search?size=0"
+        expected_count_endpoint = test_constants.SOURCE_ENDPOINT + ",".join(test_indices) + "/_search"
         mock_count_response = {"hits": {"total": {"value": total_docs}},
                                "aggregations": {"count": {"buckets": test_buckets}}}
         responses.get(expected_count_endpoint, json=mock_count_response)
@@ -110,7 +110,7 @@ def test_doc_count(self):
     @responses.activate
     def test_doc_count_error(self):
         test_indices = {test_constants.INDEX1_NAME, test_constants.INDEX2_NAME}
-        expected_count_endpoint = test_constants.SOURCE_ENDPOINT + ",".join(test_indices) + "/_search?size=0"
+        expected_count_endpoint = test_constants.SOURCE_ENDPOINT + ",".join(test_indices) + "/_search"
         responses.get(expected_count_endpoint, body=requests.Timeout())
         self.assertRaises(RuntimeError, index_operations.doc_count, test_indices,
                           EndpointInfo(test_constants.SOURCE_ENDPOINT))