From b723fd6d4197a520cbe1db9a29fcb84e7b293cc5 Mon Sep 17 00:00:00 2001 From: Daniel Mitterdorfer Date: Tue, 9 Mar 2021 13:30:36 +0100 Subject: [PATCH] Log results of cluster health check When the cluster health check fails e.g. due to invalid setup where we check for a green cluster on a single node but indices have been configured with replicas, the benchmark appears to hang indefinitely. The reason is that the cluster health check is retried by default but there is no indication in the logs. With this commit we log the result of the cluster health check. We also raise the log level of retry log messages so users can analyze the behavior by inspecting logs. Closes #1150 --- esrally/driver/runner.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/esrally/driver/runner.py b/esrally/driver/runner.py index 1a5e7ff73..811b4db99 100644 --- a/esrally/driver/runner.py +++ b/esrally/driver/runner.py @@ -967,13 +967,16 @@ def status(v): cluster_status = result["status"] relocating_shards = result["relocating_shards"] - return { + result = { "weight": 1, "unit": "ops", "success": status(cluster_status) >= status(expected_cluster_status) and relocating_shards <= expected_relocating_shards, "cluster-status": cluster_status, "relocating-shards": relocating_shards } + self.logger.info("%s: expected status=[%s], actual status=[%s], relocating shards=[%d], success=[%s].", + repr(self), expected_cluster_status, cluster_status, relocating_shards, result["success"]) + return result def __repr__(self, *args, **kwargs): return "cluster-health" @@ -2208,7 +2211,8 @@ async def __call__(self, es, params): self.logger.debug("%s has returned successfully", repr(self.delegate)) return return_value else: - self.logger.debug("%s has returned with an error: %s.", repr(self.delegate), return_value) + self.logger.info("[%s] has returned with an error: %s. Retrying in [%.2f] seconds.", + repr(self.delegate), return_value, sleep_time) await asyncio.sleep(sleep_time) else: return return_value @@ -2221,7 +2225,7 @@ async def __call__(self, es, params): if last_attempt or not retry_on_timeout: raise e elif e.status_code == 408: - self.logger.debug("%s has timed out.", repr(self.delegate)) + self.logger.info("[%s] has timed out. Retrying in [%.2f] seconds.", repr(self.delegate), sleep_time) await asyncio.sleep(sleep_time) else: raise e