From c2009de947a57cddd6675ead7560a736430e5d80 Mon Sep 17 00:00:00 2001 From: Kamil Holubicki Date: Thu, 29 Dec 2022 14:40:37 +0100 Subject: [PATCH] Orchestrator GUI incorrectly shows recovery option for intermediate database in chained replication https://github.com/openark/orchestrator/issues/1463 Problem: If we've got replication chain A->B->C, and C is down, GUI shows 'Recover' dropdown for node B, but there is no possible recovery action available in such a case. Cause: The root cause of the problem is the analysis logic in Analysis_dao.go:GetReplicationAnalysis(). The condition for setting AllIntermediateMasterReplicasNotReplicating does not check if there are any replicas reachable. So the case when all replicas are dead (no recovery action possible) and the case when some replicas are still reachable, but are not replicating (recovery action possible) are undistingushable. Solution: Improve the analysis logic. Report AllIntermediateMasterReplicasNotReplicating only if all replicas are not replicating, but there are still some reachable replicas. This commit also contains improvement of not trying to query the node which is not reachable (ping node before examinig it) --- go/inst/analysis_dao.go | 4 ++-- go/inst/instance_dao.go | 10 +++++++++- .../01-relocate/expect_output | 6 ++++++ .../01-relocate/run | 5 +++++ .../02-down-slave-3/expect_output | 0 .../02-down-slave-3/run | 1 + .../02-down-slave-3/setup | 6 ++++++ .../intermediate-master-replica-failure/skip_run | 0 .../teardown_redeploy | 0 tests/system/test.sh | 6 +++--- 10 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 tests/system/intermediate-master-replica-failure/01-relocate/expect_output create mode 100644 tests/system/intermediate-master-replica-failure/01-relocate/run create mode 100644 tests/system/intermediate-master-replica-failure/02-down-slave-3/expect_output create mode 100644 tests/system/intermediate-master-replica-failure/02-down-slave-3/run create mode 100644 tests/system/intermediate-master-replica-failure/02-down-slave-3/setup create mode 100644 tests/system/intermediate-master-replica-failure/skip_run create mode 100644 tests/system/intermediate-master-replica-failure/teardown_redeploy diff --git a/go/inst/analysis_dao.go b/go/inst/analysis_dao.go index d352abbcd..8af2cf030 100644 --- a/go/inst/analysis_dao.go +++ b/go/inst/analysis_dao.go @@ -608,9 +608,9 @@ func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) a.Analysis = AllIntermediateMasterReplicasFailingToConnectOrDead a.Description = "Intermediate master is reachable but all of its replicas are failing to connect" // - } else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + } else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { a.Analysis = AllIntermediateMasterReplicasNotReplicating - a.Description = "Intermediate master is reachable but none of its replicas is replicating" + a.Description = "Intermediate master is reachable but none of its reachable replicas is replicating" // } else if a.IsBinlogServer && a.IsFailingToConnectToMaster { a.Analysis = BinlogServerFailingToConnectToMaster diff --git a/go/inst/instance_dao.go b/go/inst/instance_dao.go index 63324c749..349f81289 100644 --- a/go/inst/instance_dao.go +++ b/go/inst/instance_dao.go @@ -355,13 +355,21 @@ func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, bufferWrites bool, latency.Start("instance") db, err := db.OpenDiscovery(instanceKey.Hostname, instanceKey.Port) - latency.Stop("instance") if err != nil { + latency.Stop("instance") goto Cleanup } + // Even if the instance is dead, we need its key below to update + // the backend database's timestamps instance.Key = *instanceKey + err = db.Ping() + if err != nil { + goto Cleanup + } + latency.Stop("instance") + if isMaxScale, resolvedHostname, err = instance.checkMaxScale(db, latency); err != nil { // We do not "goto Cleanup" here, although it should be the correct flow. // Reason is 5.7's new security feature that requires GRANTs on performance_schema.session_variables. diff --git a/tests/system/intermediate-master-replica-failure/01-relocate/expect_output b/tests/system/intermediate-master-replica-failure/01-relocate/expect_output new file mode 100644 index 000000000..88500c3e9 --- /dev/null +++ b/tests/system/intermediate-master-replica-failure/01-relocate/expect_output @@ -0,0 +1,6 @@ +127.0.0.1:10113<127.0.0.1:10112 +127.0.0.1:10114<127.0.0.1:10113 +127.0.0.1:10111 |0s|ok ++ 127.0.0.1:10112 |0s|ok + + 127.0.0.1:10113 |0s|ok + + 127.0.0.1:10114|0s|ok diff --git a/tests/system/intermediate-master-replica-failure/01-relocate/run b/tests/system/intermediate-master-replica-failure/01-relocate/run new file mode 100644 index 000000000..de8eea3dc --- /dev/null +++ b/tests/system/intermediate-master-replica-failure/01-relocate/run @@ -0,0 +1,5 @@ +# 10111 -> 10112 -> 10113 -> 10114 +orchestrator-client -c relocate -i 127.0.0.1:10113 -d 127.0.0.1:10112 +orchestrator-client -c relocate -i 127.0.0.1:10114 -d 127.0.0.1:10113 + +orchestrator-client -c topology-tabulated -alias ci | cut -d'|' -f 1,2,3 diff --git a/tests/system/intermediate-master-replica-failure/02-down-slave-3/expect_output b/tests/system/intermediate-master-replica-failure/02-down-slave-3/expect_output new file mode 100644 index 000000000..e69de29bb diff --git a/tests/system/intermediate-master-replica-failure/02-down-slave-3/run b/tests/system/intermediate-master-replica-failure/02-down-slave-3/run new file mode 100644 index 000000000..d4bffb9fc --- /dev/null +++ b/tests/system/intermediate-master-replica-failure/02-down-slave-3/run @@ -0,0 +1 @@ +orchestrator-client -c replication-analysis diff --git a/tests/system/intermediate-master-replica-failure/02-down-slave-3/setup b/tests/system/intermediate-master-replica-failure/02-down-slave-3/setup new file mode 100644 index 000000000..c3257c83a --- /dev/null +++ b/tests/system/intermediate-master-replica-failure/02-down-slave-3/setup @@ -0,0 +1,6 @@ +#!/bin/bash + +set -e + +mysqladmin -uci -pci -h 127.0.0.1 --port=10114 shutdown +sleep 20 diff --git a/tests/system/intermediate-master-replica-failure/skip_run b/tests/system/intermediate-master-replica-failure/skip_run new file mode 100644 index 000000000..e69de29bb diff --git a/tests/system/intermediate-master-replica-failure/teardown_redeploy b/tests/system/intermediate-master-replica-failure/teardown_redeploy new file mode 100644 index 000000000..e69de29bb diff --git a/tests/system/test.sh b/tests/system/test.sh index a1498c2b6..81585def6 100755 --- a/tests/system/test.sh +++ b/tests/system/test.sh @@ -187,7 +187,7 @@ test_single() { fi # test steps: - find "$tests_path/$test_name" ! -path . -type d -mindepth 1 -maxdepth 1 | sort | cut -d "/" -f 5 | while read test_step_name ; do + find "$tests_path/$test_name" -mindepth 1 -maxdepth 1 ! -path . -type d | sort | cut -d "/" -f 5 | while read test_step_name ; do [ "$test_step_name" == "." ] && continue test_step "$tests_path/$test_name/$test_step_name" "$test_name" "$test_step_name" if [ $? -ne 0 ] ; then @@ -301,7 +301,7 @@ test_all() { while [ -s $tests_todo_file ] ; do echo -n > $tests_todo_file - find $tests_path ! -path . -type d -mindepth 1 -maxdepth 1 | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do + find $tests_path -mindepth 1 -maxdepth 1 ! -path . -type d | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do if ! test_listed_as_attempted "$test_name" ; then echo "$test_name" >> $tests_todo_file fi @@ -318,7 +318,7 @@ test_all() { fi done || return 1 done - find $tests_path ! -path . -type d -mindepth 1 -maxdepth 1 | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do + find $tests_path -mindepth 1 -maxdepth 1 ! -path . -type d | xargs ls -td1 | cut -d "/" -f 4 | egrep "$test_pattern" | while read test_name ; do if ! test_listed_as_attempted "$test_name" ; then echo "# ERROR: tests completed by $test_name seems to have been skipped" exit 1