From 64420d8874fb249b240e10841afa30f4e26b93a2 Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Tue, 17 Oct 2023 13:41:20 +0800 Subject: [PATCH 01/10] fix: No Namenodes available when use observer --- .../resolver/MembershipNamenodeResolver.java | 24 ++- .../federation/router/RouterRpcClient.java | 40 ++-- .../federation/MiniRouterDFSCluster.java | 4 +- .../TestNoNamenodesAvailableLongTime.java | 190 ++++++++++++++++++ 4 files changed, 237 insertions(+), 21 deletions(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java index c0e800e0430d4..3de331a642780 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java @@ -491,10 +491,22 @@ public void setRouterId(String router) { public void rotateCache( String nsId, FederationNamenodeContext namenode, boolean listObserversFirst) { cacheNS.compute(Pair.of(nsId, listObserversFirst), (ns, namenodeContexts) -> { - if (namenodeContexts == null || namenodeContexts.size() <= 1) { + if (namenodeContexts == null) { return namenodeContexts; } - FederationNamenodeContext firstNamenodeContext = namenodeContexts.get(0); + List nonOberverNnContexts = new ArrayList<>(); + List oberverNnContexts = new ArrayList<>(); + for (FederationNamenodeContext namenodeContext : namenodeContexts) { + if (namenodeContext.getState() == OBSERVER) { + oberverNnContexts.add(namenodeContext); + }else { + 
nonOberverNnContexts.add(namenodeContext); + } + } + if (nonOberverNnContexts.size() <= 1) { + return namenodeContexts; + } + FederationNamenodeContext firstNamenodeContext = nonOberverNnContexts.get(0); /* * If the first nn in the cache is active, the active nn priority cannot be lowered. * This happens when other threads have already updated the cache. @@ -508,13 +520,13 @@ public void rotateCache( * This happens when other threads have already rotated the cache. */ if (firstNamenodeContext.getRpcAddress().equals(namenode.getRpcAddress())) { - List rotatedNnContexts = new ArrayList<>(namenodeContexts); - Collections.rotate(rotatedNnContexts, -1); - String firstNamenodeId = namenodeContexts.get(0).getNamenodeId(); + Collections.rotate(nonOberverNnContexts, -1); + String firstNamenodeId = nonOberverNnContexts.get(0).getNamenodeId(); LOG.info("Rotate cache of pair , put namenode: {} in the " + "first position of the cache and namenode: {} in the last position of the cache", nsId, listObserversFirst, firstNamenodeId, namenode.getNamenodeId()); - return rotatedNnContexts; + oberverNnContexts.addAll(nonOberverNnContexts); + return oberverNnContexts; } return namenodeContexts; }); diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java index b38900c3bc264..99cf0d2ed0878 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java @@ -461,10 +461,12 @@ private static IOException toIOException(Exception e) { * @throws NoNamenodesAvailableException Exception that the retry policy * generates for no available namenodes. 
*/ - private RetryDecision shouldRetry(final IOException ioe, final int retryCount, - final String nsId) throws IOException { + private RetryDecision shouldRetry( + final IOException ioe, final int retryCount, final String nsId, + final FederationNamenodeContext namenode, + final boolean listObserverFirst) throws IOException { // check for the case of cluster unavailable state - if (isClusterUnAvailable(nsId)) { + if (isClusterUnAvailable(nsId, namenode, listObserverFirst)) { // we allow to retry once if cluster is unavailable if (retryCount == 0) { return RetryDecision.RETRY; @@ -538,7 +540,7 @@ public Object invokeMethod( ProxyAndInfo client = connection.getClient(); final Object proxy = client.getProxy(); - ret = invoke(nsId, 0, method, proxy, params); + ret = invoke(nsId, namenode, useObserver, 0, method, proxy, params); if (failover && FederationNamenodeServiceState.OBSERVER != namenode.getState()) { // Success on alternate server, update @@ -599,8 +601,11 @@ public Object invokeMethod( } LOG.error("Cannot get available namenode for {} {} error: {}", nsId, rpcAddress, ioe.getMessage()); - // Rotate cache so that client can retry the next namenode in the cache - this.namenodeResolver.rotateCache(nsId, namenode, shouldUseObserver); + // Only if the namenode is unavailable, + // rotate cache so that client can retry the next namenode in the cache + if (isUnavailableException(ioe.getCause())) { + this.namenodeResolver.rotateCache(nsId, namenode, useObserver); + } // Throw RetriableException so that client can retry throw new RetriableException(ioe); } else { @@ -708,7 +713,9 @@ private void addClientInfoToCallerContext(UserGroupInformation ugi) { * @return Response from the remote server * @throws IOException If error occurs. 
*/ - private Object invoke(String nsId, int retryCount, final Method method, + private Object invoke( + String nsId, FederationNamenodeContext namenode, Boolean listObserverFirst, + int retryCount, final Method method, final Object obj, final Object... params) throws IOException { try { return method.invoke(obj, params); @@ -721,14 +728,14 @@ private Object invoke(String nsId, int retryCount, final Method method, IOException ioe = (IOException) cause; // Check if we should retry. - RetryDecision decision = shouldRetry(ioe, retryCount, nsId); + RetryDecision decision = shouldRetry(ioe, retryCount, nsId, namenode, listObserverFirst); if (decision == RetryDecision.RETRY) { if (this.rpcMonitor != null) { this.rpcMonitor.proxyOpRetries(); } // retry - return invoke(nsId, ++retryCount, method, obj, params); + return invoke(nsId, namenode, listObserverFirst, ++retryCount, method, obj, params); } else if (decision == RetryDecision.FAILOVER_AND_RETRY) { // failover, invoker looks for standby exceptions for failover. if (ioe instanceof StandbyException) { @@ -752,7 +759,7 @@ private Object invoke(String nsId, int retryCount, final Method method, * @param ioe IOException to check. * @return If the exception comes from an unavailable subcluster. */ - public static boolean isUnavailableException(IOException ioe) { + public static boolean isUnavailableException(Throwable ioe) { if (ioe instanceof ConnectTimeoutException || ioe instanceof EOFException || ioe instanceof SocketException || @@ -772,13 +779,20 @@ public static boolean isUnavailableException(IOException ioe) { * Check if the cluster of given nameservice id is available. * * @param nsId nameservice ID. + * @param namenode + * @param listObserverFirst * @return true if the cluster with given nameservice id is available. * @throws IOException if error occurs. 
*/ - private boolean isClusterUnAvailable(String nsId) throws IOException { - List nnState = this.namenodeResolver - .getNamenodesForNameserviceId(nsId, false); + private boolean isClusterUnAvailable( + String nsId, FederationNamenodeContext namenode, + boolean listObserverFirst) throws IOException { + if (listObserverFirst && namenode.getState() == FederationNamenodeServiceState.OBSERVER) { + return false; + } + List nnState = this.namenodeResolver + .getNamenodesForNameserviceId(nsId, listObserverFirst); if (nnState != null) { for (FederationNamenodeContext nnContext : nnState) { // Once we find one NN is in active state, we assume this diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/MiniRouterDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/MiniRouterDFSCluster.java index bf22cf01148a3..4a023b3c4e035 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/MiniRouterDFSCluster.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/MiniRouterDFSCluster.java @@ -132,9 +132,9 @@ public class MiniRouterDFSCluster { /** Mini cluster. */ private MiniDFSCluster cluster; - protected static final long DEFAULT_HEARTBEAT_INTERVAL_MS = + public static final long DEFAULT_HEARTBEAT_INTERVAL_MS = TimeUnit.SECONDS.toMillis(5); - protected static final long DEFAULT_CACHE_INTERVAL_MS = + public static final long DEFAULT_CACHE_INTERVAL_MS = TimeUnit.SECONDS.toMillis(5); /** Heartbeat interval in milliseconds. 
*/ private long heartbeatInterval; diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java new file mode 100644 index 0000000000000..9b4b7588e3a1c --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -0,0 +1,190 @@ +package org.apache.hadoop.hdfs.server.federation.router; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DFSClient; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster; +import org.apache.hadoop.hdfs.server.federation.RouterConfigBuilder; +import org.apache.hadoop.hdfs.server.federation.StateStoreDFSCluster; +import org.apache.hadoop.hdfs.server.federation.metrics.FederationRPCMetrics; +import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeContext; +import org.apache.hadoop.util.Time; +import org.junit.After; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + + + +import java.net.URI; +import java.util.Collection; +import java.util.List; + +import static org.apache.hadoop.ha.HAServiceProtocol.HAServiceState.ACTIVE; +import static org.apache.hadoop.ha.HAServiceProtocol.HAServiceState.STANDBY; +import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToStandby;; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class TestNoNamenodesAvailableLongTime { + + private StateStoreDFSCluster cluster; + + @After + public void cleanup() { + if (cluster != null) { + cluster.shutdown(); + cluster = null; + } + } + + @Rule 
+ public ExpectedException exceptionRule = ExpectedException.none(); + + private void setupCluster(int numNameservices, int numberOfObserver) + throws Exception { + int numberOfNamenode = 2 + numberOfObserver; + cluster = new StateStoreDFSCluster(true, numNameservices, numberOfNamenode); + Configuration routerConf = new RouterConfigBuilder() + .stateStore() + .metrics() + .admin() + .rpc() + .heartbeat() + .build(); + + routerConf.setBoolean(RBFConfigKeys.DFS_ROUTER_OBSERVER_READ_DEFAULT_KEY, true); + routerConf.setBoolean(DFSConfigKeys.DFS_HA_TAILEDITS_INPROGRESS_KEY, true); + routerConf.set(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, "0ms"); + + // Reduce the number of RPC clients threads to overload the Router easy + routerConf.setInt(RBFConfigKeys.DFS_ROUTER_CLIENT_THREADS_SIZE, 4); + // Overload control + routerConf.setBoolean( + RBFConfigKeys.DFS_ROUTER_CLIENT_REJECT_OVERLOAD, false); + + // No need for datanodes as we use renewLease() for testing + cluster.setNumDatanodesPerNameservice(0); + // No need for datanodes as we use renewLease() for testing + cluster.setNumDatanodesPerNameservice(0); + cluster.addRouterOverrides(routerConf); + + + cluster.startCluster(); +// // Making one Namenode active per nameservice +// if (cluster.isHighAvailability()) { +// for (String ns : cluster.getNameservices()) { +// cluster.switchToActive(ns, NAMENODES[0]); +// cluster.switchToStandby(ns, NAMENODES[1]); +// for (int i = 2; i < numberOfNamenode; i++) { +// cluster.switchToObserver(ns, NAMENODES[i]); +// } +// } +// } + + cluster.startRouters(); + cluster.waitClusterUp(); + } + + /** + * When failover occurs, the router may record that the ns has no active namenode. + * Only when the router updates the cache next time can the memory status be updated, + * causing the router to report NoNamenodesAvailableException for a long time. 
+ */ + @Test + public void testNoNamenodesAvailableLongTimeWhenNsFailover() throws Exception { + setupCluster(1, 2); + transitionClusterNSToStandby(cluster); + for (MiniRouterDFSCluster.RouterContext routerContext : cluster.getRouters()) { + // Manually trigger the heartbeat + Collection heartbeatServices = routerContext + .getRouter().getNamenodeHeartbeatServices(); + for (NamenodeHeartbeatService service : heartbeatServices) { + service.periodicInvoke(); + } + // Update service cache + routerContext.getRouter().getStateStore().refreshCaches(true); + } + // Record the time after the router first updated the cache + long firstLoadTime = Time.now(); + List namenodes = cluster.getNamenodes(); + + // Make sure all namenodes are in standby state + for (MiniRouterDFSCluster.NamenodeContext namenodeContext : namenodes) { + assertEquals(STANDBY.ordinal(), namenodeContext.getNamenode().getNameNodeState()); + } + + Configuration conf = cluster.getRouterClientConf(); + // Set dfs.client.failover.random.order false, to pick 1st router at first + conf.setBoolean("dfs.client.failover.random.order", false); + + DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf); + + for (MiniRouterDFSCluster.RouterContext routerContext : cluster.getRouters()) { + // Get the second namenode in the router cache and make it active + List ns0 = routerContext.getRouter() + .getNamenodeResolver() + .getNamenodesForNameserviceId("ns0", false); + + String nsId = ns0.get(1).getNamenodeId(); + cluster.switchToActive("ns0", nsId); + // Manually trigger the heartbeat, but the router does not manually load the cache + Collection heartbeatServices = routerContext + .getRouter().getNamenodeHeartbeatServices(); + for (NamenodeHeartbeatService service : heartbeatServices) { + service.periodicInvoke(); + } + assertEquals(ACTIVE.ordinal(), + cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState()); + } + + // Get router0 metrics + FederationRPCMetrics rpcMetrics0 = 
cluster.getRouters().get(0) + .getRouter().getRpcServer().getRPCMetrics(); + // Original failures + long originalRouter0Failures = rpcMetrics0.getProxyOpNoNamenodes(); + + /* + * At this time, the router has recorded 2 standby namenodes in memory, + * and the first accessed namenode is indeed standby, + * then an NoNamenodesAvailableException will be reported for the first access, + * and the next access will be successful. + */ + routerClient.getFileInfo("/"); + long successReadTime = Time.now(); + assertEquals(originalRouter0Failures + 1, rpcMetrics0.getProxyOpNoNamenodes()); + + /* + * access the active namenode without waiting for the router to update the cache, + * even if there are 2 standby states recorded in the router memory. + */ + assertTrue(successReadTime - firstLoadTime < cluster.getCacheFlushInterval()); + } + + @Test + public void testOnn() throws Exception { + setupCluster(1, 2); + MiniRouterDFSCluster.RouterContext router = cluster.getRandomRouter(); + + for (MiniRouterDFSCluster.RouterContext routerContext : cluster.getRouters()) { + // Manually trigger the heartbeat + Collection heartbeatServices = routerContext + .getRouter().getNamenodeHeartbeatServices(); + for (NamenodeHeartbeatService service : heartbeatServices) { + service.periodicInvoke(); + } + // Update service cache + routerContext.getRouter().getStateStore().refreshCaches(true); + } + Path path = new Path("/testFile"); + FileSystem fileSystem = router.getFileSystem(); + // Send Create call to active + fileSystem.create(path).close(); + + // Send read request to observer. The router will msync to the active namenode. 
+ fileSystem.open(path).close(); + } + +} From 5388d943a43e64cff59558cc0318cf812633562b Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Tue, 17 Oct 2023 23:05:32 +0800 Subject: [PATCH 02/10] fix: No Namenodes available when use observer --- hadoop-client-modules/pom.xml | 4 +- .../federation/router/RouterRpcClient.java | 21 +- .../TestNoNamenodesAvailableLongTime.java | 204 ++++++++++++------ 3 files changed, 156 insertions(+), 73 deletions(-) diff --git a/hadoop-client-modules/pom.xml b/hadoop-client-modules/pom.xml index fb4aedb0aeb43..907fc1de14674 100644 --- a/hadoop-client-modules/pom.xml +++ b/hadoop-client-modules/pom.xml @@ -37,9 +37,9 @@ hadoop-client-minicluster hadoop-client-check-invariants - hadoop-client-check-test-invariants + - hadoop-client-integration-tests + diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java index 99cf0d2ed0878..67b40a9977e68 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java @@ -596,14 +596,14 @@ public Object invokeMethod( se.initCause(ioe); throw se; } else if (ioe instanceof NoNamenodesAvailableException) { + IOException cause = (IOException) ioe.getCause(); if (this.rpcMonitor != null) { this.rpcMonitor.proxyOpNoNamenodes(nsId); } LOG.error("Cannot get available namenode for {} {} error: {}", nsId, rpcAddress, ioe.getMessage()); - // Only if the namenode is unavailable, - // rotate cache so that client can retry the next namenode in the cache - if (isUnavailableException(ioe.getCause())) { + // Rotate cache so that client can retry the next namenode in the cache + if (shouldRotateCache(cause)) { 
this.namenodeResolver.rotateCache(nsId, namenode, useObserver); } // Throw RetriableException so that client can retry @@ -759,7 +759,7 @@ private Object invoke( * @param ioe IOException to check. * @return If the exception comes from an unavailable subcluster. */ - public static boolean isUnavailableException(Throwable ioe) { + public static boolean isUnavailableException(IOException ioe) { if (ioe instanceof ConnectTimeoutException || ioe instanceof EOFException || ioe instanceof SocketException || @@ -1844,4 +1844,17 @@ private LongAccumulator getTimeOfLastCallToActive(String namespaceId) { return lastActiveNNRefreshTimes .computeIfAbsent(namespaceId, key -> new LongAccumulator(Math::max, 0)); } + + // todo javadoc + private boolean shouldRotateCache(IOException ioe) { + if (isUnavailableException(ioe)) { + return true; + } + if (ioe instanceof RemoteException) { + RemoteException re = (RemoteException) ioe; + ioe = re.unwrapRemoteException(); + ioe = getCleanException(ioe); + } + return isUnavailableException(ioe); + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java index 9b4b7588e3a1c..e69ae5c4d2e8a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -1,15 +1,25 @@ package org.apache.hadoop.hdfs.server.federation.router; +import static org.apache.hadoop.fs.permission.AclEntryType.USER; +import static org.apache.hadoop.fs.permission.FsAction.ALL; +import static org.apache.hadoop.fs.permission.AclEntryScope.DEFAULT; import org.apache.hadoop.conf.Configuration; import 
org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.AclEntry; +import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hdfs.DFSClient; import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.protocol.AclException; import org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster; +import org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster.RouterContext; import org.apache.hadoop.hdfs.server.federation.RouterConfigBuilder; import org.apache.hadoop.hdfs.server.federation.StateStoreDFSCluster; import org.apache.hadoop.hdfs.server.federation.metrics.FederationRPCMetrics; import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeContext; +import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeServiceState; +import org.apache.hadoop.ipc.RemoteException; +import org.apache.hadoop.util.Lists; import org.apache.hadoop.util.Time; import org.junit.After; import org.junit.Rule; @@ -17,19 +27,28 @@ import org.junit.rules.ExpectedException; - +import java.io.IOException; import java.net.URI; import java.util.Collection; import java.util.List; +import java.util.concurrent.TimeUnit; import static org.apache.hadoop.ha.HAServiceProtocol.HAServiceState.ACTIVE; import static org.apache.hadoop.ha.HAServiceProtocol.HAServiceState.STANDBY; import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToStandby;; +import static org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster.DEFAULT_HEARTBEAT_INTERVAL_MS; +import static org.apache.hadoop.hdfs.server.namenode.AclTestHelpers.aclEntry; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +/** + * When failover occurs, the router may record that the ns has no active namenode. 
+ * Only when the router updates the cache next time can the memory status be updated, + * causing the router to report NoNamenodesAvailableException for a long time. + */ public class TestNoNamenodesAvailableLongTime { + private StateStoreDFSCluster cluster; @After @@ -43,10 +62,14 @@ public void cleanup() { @Rule public ExpectedException exceptionRule = ExpectedException.none(); - private void setupCluster(int numNameservices, int numberOfObserver) + private void setupCluster(int numNameservices, int numberOfObserver, boolean useObserver) throws Exception { + if (!useObserver) { + numberOfObserver = 0; + } int numberOfNamenode = 2 + numberOfObserver; - cluster = new StateStoreDFSCluster(true, numNameservices, numberOfNamenode); + cluster = new StateStoreDFSCluster(true, numNameservices, numberOfNamenode, + DEFAULT_HEARTBEAT_INTERVAL_MS, 10000); Configuration routerConf = new RouterConfigBuilder() .stateStore() .metrics() @@ -55,9 +78,11 @@ private void setupCluster(int numNameservices, int numberOfObserver) .heartbeat() .build(); - routerConf.setBoolean(RBFConfigKeys.DFS_ROUTER_OBSERVER_READ_DEFAULT_KEY, true); - routerConf.setBoolean(DFSConfigKeys.DFS_HA_TAILEDITS_INPROGRESS_KEY, true); - routerConf.set(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, "0ms"); + if (useObserver) { + routerConf.setBoolean(RBFConfigKeys.DFS_ROUTER_OBSERVER_READ_DEFAULT_KEY, true); + routerConf.setBoolean(DFSConfigKeys.DFS_HA_TAILEDITS_INPROGRESS_KEY, true); + routerConf.set(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, "0ms"); + } // Reduce the number of RPC clients threads to overload the Router easy routerConf.setInt(RBFConfigKeys.DFS_ROUTER_CLIENT_THREADS_SIZE, 4); @@ -65,48 +90,36 @@ private void setupCluster(int numNameservices, int numberOfObserver) routerConf.setBoolean( RBFConfigKeys.DFS_ROUTER_CLIENT_REJECT_OVERLOAD, false); - // No need for datanodes as we use renewLease() for testing - cluster.setNumDatanodesPerNameservice(0); // No need for datanodes as we use renewLease() for 
testing cluster.setNumDatanodesPerNameservice(0); cluster.addRouterOverrides(routerConf); cluster.startCluster(); -// // Making one Namenode active per nameservice -// if (cluster.isHighAvailability()) { -// for (String ns : cluster.getNameservices()) { -// cluster.switchToActive(ns, NAMENODES[0]); -// cluster.switchToStandby(ns, NAMENODES[1]); -// for (int i = 2; i < numberOfNamenode; i++) { -// cluster.switchToObserver(ns, NAMENODES[i]); -// } -// } -// } + // Making one Namenode active per nameservice + if (cluster.isHighAvailability()) { + for (String ns : cluster.getNameservices()) { + List nnList = cluster.getNamenodes(ns); + cluster.switchToActive(ns, nnList.get(0).getNamenodeId()); + cluster.switchToStandby(ns, nnList.get(1).getNamenodeId()); + for (int i = 2; i < numberOfNamenode; i++) { + cluster.switchToObserver(ns, nnList.get(i).getNamenodeId()); + } + } + } cluster.startRouters(); cluster.waitClusterUp(); } /** - * When failover occurs, the router may record that the ns has no active namenode. - * Only when the router updates the cache next time can the memory status be updated, - * causing the router to report NoNamenodesAvailableException for a long time. 
+ * */ @Test - public void testNoNamenodesAvailableLongTimeWhenNsFailover() throws Exception { - setupCluster(1, 2); + public void testCacheShouldNotBeRotated() throws Exception { + setupCluster(1, 0, false); transitionClusterNSToStandby(cluster); - for (MiniRouterDFSCluster.RouterContext routerContext : cluster.getRouters()) { - // Manually trigger the heartbeat - Collection heartbeatServices = routerContext - .getRouter().getNamenodeHeartbeatServices(); - for (NamenodeHeartbeatService service : heartbeatServices) { - service.periodicInvoke(); - } - // Update service cache - routerContext.getRouter().getStateStore().refreshCaches(true); - } + allRoutersHeartbeatAndLoadCache(); // Record the time after the router first updated the cache long firstLoadTime = Time.now(); List namenodes = cluster.getNamenodes(); @@ -122,29 +135,13 @@ public void testNoNamenodesAvailableLongTimeWhenNsFailover() throws Exception { DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf); - for (MiniRouterDFSCluster.RouterContext routerContext : cluster.getRouters()) { - // Get the second namenode in the router cache and make it active - List ns0 = routerContext.getRouter() - .getNamenodeResolver() - .getNamenodesForNameserviceId("ns0", false); - - String nsId = ns0.get(1).getNamenodeId(); - cluster.switchToActive("ns0", nsId); - // Manually trigger the heartbeat, but the router does not manually load the cache - Collection heartbeatServices = routerContext - .getRouter().getNamenodeHeartbeatServices(); - for (NamenodeHeartbeatService service : heartbeatServices) { - service.periodicInvoke(); - } - assertEquals(ACTIVE.ordinal(), - cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState()); - } - + // Get the second namenode in the router cache and make it active + setSecondNamenodeInTheRouterCacheActive(false); + RouterContext routerContext = cluster.getRouters().get(0); // Get router0 metrics - FederationRPCMetrics rpcMetrics0 = cluster.getRouters().get(0) - 
.getRouter().getRpcServer().getRPCMetrics(); + FederationRPCMetrics rpcMetrics0 = routerContext.getRouter().getRpcServer().getRPCMetrics(); // Original failures - long originalRouter0Failures = rpcMetrics0.getProxyOpNoNamenodes(); + long originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); /* * At this time, the router has recorded 2 standby namenodes in memory, @@ -152,22 +149,80 @@ public void testNoNamenodesAvailableLongTimeWhenNsFailover() throws Exception { * then an NoNamenodesAvailableException will be reported for the first access, * and the next access will be successful. */ + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); + routerClient.create("/test.txt", true); + assertEquals(originalRouter0NoNamenodesFailures + 1, rpcMetrics0.getProxyOpNoNamenodes()); + originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); + /* + * At this time, the router has recorded 2 standby namenodes in memory, + * we have put the actually active namenode at the front of the cache by rotating the cache. + * Therefore, the access does not cause NoNamenodesAvailableException. + */ + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); + routerClient.setPermission("/test.txt", FsPermission.createImmutable((short)0640)); + assertEquals(originalRouter0NoNamenodesFailures, rpcMetrics0.getProxyOpNoNamenodes()); + + /* + * At this time, the router has recorded 2 standby namenodes in memory, + * we have put the actually active namenode at the front of the cache by rotating the cache. + * If the router sends an illegal operation to active nn, + * NoNamenodesAvailableException will still be reported at this time, + * and the cache should not be rotated due to illegal operations. + */ + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); + String exceptionMessage = "Invalid ACL: " + + "only directories may have a default ACL. 
Path: /test.txt Path: /test.txt"; + exceptionRule.expect(AclException.class); + exceptionRule.expectMessage(exceptionMessage); + List aclSpec = Lists.newArrayList(aclEntry(DEFAULT, USER, "foo", ALL)); + routerClient.setAcl("/test.txt", aclSpec); + assertEquals(originalRouter0NoNamenodesFailures + 4, rpcMetrics0.getProxyOpNoNamenodes()); + originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); + + // So legal operations can be accessed normally without reporting NoNamenodesAvailableException. + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); routerClient.getFileInfo("/"); - long successReadTime = Time.now(); - assertEquals(originalRouter0Failures + 1, rpcMetrics0.getProxyOpNoNamenodes()); + assertEquals(originalRouter0NoNamenodesFailures, rpcMetrics0.getProxyOpNoNamenodes()); /* - * access the active namenode without waiting for the router to update the cache, + * Access the active namenode without waiting for the router to update the cache, * even if there are 2 standby states recorded in the router memory. */ - assertTrue(successReadTime - firstLoadTime < cluster.getCacheFlushInterval()); + long endTime = Time.now(); + assertTrue(endTime - firstLoadTime < cluster.getCacheFlushInterval()); } @Test - public void testOnn() throws Exception { - setupCluster(1, 2); - MiniRouterDFSCluster.RouterContext router = cluster.getRandomRouter(); + public void testUseObserver() throws Exception { + setupCluster(1, 2, true); + RouterContext router = cluster.getRandomRouter(); + allRoutersHeartbeatAndLoadCache(); + setSecondNamenodeInTheRouterCacheActive(true); + Path path = new Path("/testFile"); + FileSystem fileSystem = router.getFileSystem(); + // Send Create call to active + fileSystem.create(path).close(); + + // Send read request to observer. The router will msync to the active namenode. + fileSystem.open(path).close(); + } + /** + * Determine whether the router has an active namenode. 
+ */ + private boolean routerCacheNoActiveNamenode( + RouterContext context, String nsId, boolean useObserver) throws IOException { + List namenodes + = context.getRouter().getNamenodeResolver().getNamenodesForNameserviceId(nsId, useObserver); + for (FederationNamenodeContext namenode : namenodes) { + if (namenode.getState() == FederationNamenodeServiceState.ACTIVE){ + return false; + } + } + return true; + } + + private void allRoutersHeartbeatAndLoadCache() { for (MiniRouterDFSCluster.RouterContext routerContext : cluster.getRouters()) { // Manually trigger the heartbeat Collection heartbeatServices = routerContext @@ -178,13 +233,28 @@ public void testOnn() throws Exception { // Update service cache routerContext.getRouter().getStateStore().refreshCaches(true); } - Path path = new Path("/testFile"); - FileSystem fileSystem = router.getFileSystem(); - // Send Create call to active - fileSystem.create(path).close(); + } - // Send read request to observer. The router will msync to the active namenode. 
- fileSystem.open(path).close(); + private void setSecondNamenodeInTheRouterCacheActive(boolean useObserver) throws IOException { + for (RouterContext routerContext : cluster.getRouters()) { + List ns0 = routerContext.getRouter() + .getNamenodeResolver() + .getNamenodesForNameserviceId("ns0", useObserver); + + String nsId = ns0.get(1).getNamenodeId(); + cluster.switchToActive("ns0", nsId); + // Manually trigger the heartbeat, but the router does not manually load the cache + Collection heartbeatServices = routerContext + .getRouter().getNamenodeHeartbeatServices(); + for (NamenodeHeartbeatService service : heartbeatServices) { + service.periodicInvoke(); + } + assertEquals(ACTIVE.ordinal(), + cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState()); + } } + private void transitionActiveToStandby() { + + } } From ecbd3bf360e15ec885212b1247282ed062bec830 Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Wed, 18 Oct 2023 12:06:03 +0800 Subject: [PATCH 03/10] fix: No Namenodes available when use observer --- .../TestNoNamenodesAvailableLongTime.java | 92 ++++++++++++------- 1 file changed, 59 insertions(+), 33 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java index e69ae5c4d2e8a..6d7d457afa3ca 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -10,7 +10,6 @@ import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hdfs.DFSClient; import org.apache.hadoop.hdfs.DFSConfigKeys; -import org.apache.hadoop.hdfs.protocol.AclException; import 
org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster; import org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster.RouterContext; import org.apache.hadoop.hdfs.server.federation.RouterConfigBuilder; @@ -22,16 +21,13 @@ import org.apache.hadoop.util.Lists; import org.apache.hadoop.util.Time; import org.junit.After; -import org.junit.Rule; import org.junit.Test; -import org.junit.rules.ExpectedException; import java.io.IOException; import java.net.URI; import java.util.Collection; import java.util.List; -import java.util.concurrent.TimeUnit; import static org.apache.hadoop.ha.HAServiceProtocol.HAServiceState.ACTIVE; import static org.apache.hadoop.ha.HAServiceProtocol.HAServiceState.STANDBY; @@ -59,9 +55,6 @@ public void cleanup() { } } - @Rule - public ExpectedException exceptionRule = ExpectedException.none(); - private void setupCluster(int numNameservices, int numberOfObserver, boolean useObserver) throws Exception { if (!useObserver) { @@ -133,50 +126,64 @@ public void testCacheShouldNotBeRotated() throws Exception { // Set dfs.client.failover.random.order false, to pick 1st router at first conf.setBoolean("dfs.client.failover.random.order", false); + // Retries is 2 (see FailoverOnNetworkExceptionRetry#shouldRetry, will fail + // when reties > max.attempts), so total access is 3. 
+ conf.setInt("dfs.client.retry.max.attempts", 1); DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf); // Get the second namenode in the router cache and make it active - setSecondNamenodeInTheRouterCacheActive(false); + setSecondNonObserverNamenodeInTheRouterCacheActive(0, false); + allRoutersHeartbeatNotLoadCache(false); + RouterContext routerContext = cluster.getRouters().get(0); // Get router0 metrics FederationRPCMetrics rpcMetrics0 = routerContext.getRouter().getRpcServer().getRPCMetrics(); // Original failures long originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); + // At this time, the router has recorded 2 standby namenodes in memory. + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); + /* - * At this time, the router has recorded 2 standby namenodes in memory, - * and the first accessed namenode is indeed standby, + * The first accessed namenode is indeed standby, * then an NoNamenodesAvailableException will be reported for the first access, * and the next access will be successful. */ - assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); routerClient.create("/test.txt", true); assertEquals(originalRouter0NoNamenodesFailures + 1, rpcMetrics0.getProxyOpNoNamenodes()); originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); + + // At this time, the router has recorded 2 standby namenodes in memory. + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); + /* - * At this time, the router has recorded 2 standby namenodes in memory, * we have put the actually active namenode at the front of the cache by rotating the cache. * Therefore, the access does not cause NoNamenodesAvailableException. 
*/ - assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); routerClient.setPermission("/test.txt", FsPermission.createImmutable((short)0640)); assertEquals(originalRouter0NoNamenodesFailures, rpcMetrics0.getProxyOpNoNamenodes()); + // At this time, the router has recorded 2 standby namenodes in memory + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); + /* - * At this time, the router has recorded 2 standby namenodes in memory, - * we have put the actually active namenode at the front of the cache by rotating the cache. * If the router sends an illegal operation to active nn, * NoNamenodesAvailableException will still be reported at this time, * and the cache should not be rotated due to illegal operations. + * */ - assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); - String exceptionMessage = "Invalid ACL: " + - "only directories may have a default ACL. Path: /test.txt Path: /test.txt"; - exceptionRule.expect(AclException.class); - exceptionRule.expectMessage(exceptionMessage); List aclSpec = Lists.newArrayList(aclEntry(DEFAULT, USER, "foo", ALL)); - routerClient.setAcl("/test.txt", aclSpec); - assertEquals(originalRouter0NoNamenodesFailures + 4, rpcMetrics0.getProxyOpNoNamenodes()); + try { + routerClient.setAcl("/test.txt", aclSpec); + }catch (RemoteException e) { + assertTrue(e.getMessage().contains( + "org.apache.hadoop.hdfs.server.federation.router.NoNamenodesAvailableException: " + + "No namenodes available under nameservice ns0")); + assertTrue(e.getMessage().contains( + "org.apache.hadoop.hdfs.protocol.AclException: Invalid ACL: " + + "only directories may have a default ACL. Path: /test.txt")); + } + assertEquals(originalRouter0NoNamenodesFailures + 3, rpcMetrics0.getProxyOpNoNamenodes()); originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); // So legal operations can be accessed normally without reporting NoNamenodesAvailableException. 
@@ -189,6 +196,7 @@ public void testCacheShouldNotBeRotated() throws Exception { * even if there are 2 standby states recorded in the router memory. */ long endTime = Time.now(); + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); assertTrue(endTime - firstLoadTime < cluster.getCacheFlushInterval()); } @@ -197,14 +205,14 @@ public void testUseObserver() throws Exception { setupCluster(1, 2, true); RouterContext router = cluster.getRandomRouter(); allRoutersHeartbeatAndLoadCache(); - setSecondNamenodeInTheRouterCacheActive(true); + transitionActiveToStandby(); + setSecondNonObserverNamenodeInTheRouterCacheActive(2, true); + allRoutersHeartbeatNotLoadCache(true); + + assertTrue(routerCacheNoActiveNamenode(router, "ns0", true)); Path path = new Path("/testFile"); FileSystem fileSystem = router.getFileSystem(); - // Send Create call to active - fileSystem.create(path).close(); - - // Send read request to observer. The router will msync to the active namenode. - fileSystem.open(path).close(); + fileSystem.getFileStatus(path); } /** @@ -235,26 +243,44 @@ private void allRoutersHeartbeatAndLoadCache() { } } - private void setSecondNamenodeInTheRouterCacheActive(boolean useObserver) throws IOException { + private void setSecondNonObserverNamenodeInTheRouterCacheActive( + int numberOfObserver, boolean useObserver) throws IOException { for (RouterContext routerContext : cluster.getRouters()) { List ns0 = routerContext.getRouter() .getNamenodeResolver() .getNamenodesForNameserviceId("ns0", useObserver); - String nsId = ns0.get(1).getNamenodeId(); + String nsId = ns0.get(numberOfObserver+1).getNamenodeId(); cluster.switchToActive("ns0", nsId); + assertEquals(ACTIVE.ordinal(), + cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState()); + } + } + + private void allRoutersHeartbeatNotLoadCache(boolean useObserver) throws IOException { + for (RouterContext routerContext : cluster.getRouters()) { + List ns0 = routerContext.getRouter() + 
.getNamenodeResolver() + .getNamenodesForNameserviceId("ns0", useObserver); // Manually trigger the heartbeat, but the router does not manually load the cache Collection heartbeatServices = routerContext .getRouter().getNamenodeHeartbeatServices(); for (NamenodeHeartbeatService service : heartbeatServices) { service.periodicInvoke(); } - assertEquals(ACTIVE.ordinal(), - cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState()); } } private void transitionActiveToStandby() { - + if (cluster.isHighAvailability()) { + for (String ns : cluster.getNameservices()) { + List nnList = cluster.getNamenodes(ns); + for (MiniRouterDFSCluster.NamenodeContext namenodeContext : nnList) { + if (namenodeContext.getNamenode().isActiveState()) { + cluster.switchToStandby(ns, namenodeContext.getNamenodeId()); + } + } + } + } } } From 00fc68a3ebc12dd5651841dc7d2d300bc1b824ec Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Wed, 18 Oct 2023 17:36:41 +0800 Subject: [PATCH 04/10] fix: No Namenodes available when use observer --- .../TestNoNamenodesAvailableLongTime.java | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java index 6d7d457afa3ca..8fc3f9ab03cf6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -35,6 +35,7 @@ import static org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster.DEFAULT_HEARTBEAT_INTERVAL_MS; import static org.apache.hadoop.hdfs.server.namenode.AclTestHelpers.aclEntry; import static 
org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; /** @@ -131,11 +132,12 @@ public void testCacheShouldNotBeRotated() throws Exception { conf.setInt("dfs.client.retry.max.attempts", 1); DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf); + RouterContext routerContext = cluster.getRouters().get(0); + // Get the second namenode in the router cache and make it active - setSecondNonObserverNamenodeInTheRouterCacheActive(0, false); + setSecondNonObserverNamenodeInTheRouterCacheActive(routerContext, 0, false); allRoutersHeartbeatNotLoadCache(false); - RouterContext routerContext = cluster.getRouters().get(0); // Get router0 metrics FederationRPCMetrics rpcMetrics0 = routerContext.getRouter().getRpcServer().getRPCMetrics(); // Original failures @@ -204,15 +206,32 @@ public void testCacheShouldNotBeRotated() throws Exception { public void testUseObserver() throws Exception { setupCluster(1, 2, true); RouterContext router = cluster.getRandomRouter(); - allRoutersHeartbeatAndLoadCache(); + long activeProxyOps = router.getRouter().getRpcServer() + .getRPCMetrics().getActiveProxyOps(); + + FileSystem fileSystem = router.getFileSystem(); + fileSystem.msync(); + assertEquals(activeProxyOps+1, router.getRouter().getRpcServer() + .getRPCMetrics().getActiveProxyOps()); + transitionActiveToStandby(); - setSecondNonObserverNamenodeInTheRouterCacheActive(2, true); + List namenodes = cluster.getNamenodes(); + + for (MiniRouterDFSCluster.NamenodeContext namenodeContext : namenodes) { + assertNotEquals(ACTIVE.ordinal(), namenodeContext.getNamenode().getNameNodeState()); + } + allRoutersHeartbeatAndLoadCache(); + setSecondNonObserverNamenodeInTheRouterCacheActive(router, 2, true); allRoutersHeartbeatNotLoadCache(true); assertTrue(routerCacheNoActiveNamenode(router, "ns0", true)); - Path path = new Path("/testFile"); - FileSystem fileSystem = router.getFileSystem(); + + long observerProxyOps = 
router.getRouter().getRpcServer() + .getRPCMetrics().getObserverProxyOps(); + Path path = new Path("/"); fileSystem.getFileStatus(path); + assertEquals(observerProxyOps + 1, router.getRouter().getRpcServer() + .getRPCMetrics().getObserverProxyOps()); } /** @@ -244,24 +263,20 @@ private void allRoutersHeartbeatAndLoadCache() { } private void setSecondNonObserverNamenodeInTheRouterCacheActive( - int numberOfObserver, boolean useObserver) throws IOException { - for (RouterContext routerContext : cluster.getRouters()) { - List ns0 = routerContext.getRouter() - .getNamenodeResolver() - .getNamenodesForNameserviceId("ns0", useObserver); - - String nsId = ns0.get(numberOfObserver+1).getNamenodeId(); - cluster.switchToActive("ns0", nsId); - assertEquals(ACTIVE.ordinal(), - cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState()); - } + RouterContext routerContext, int numberOfObserver, boolean useObserver) throws IOException { + List ns0 = routerContext.getRouter() + .getNamenodeResolver() + .getNamenodesForNameserviceId("ns0", useObserver); + + String nsId = ns0.get(numberOfObserver+1).getNamenodeId(); + cluster.switchToActive("ns0", nsId); + assertEquals(ACTIVE.ordinal(), + cluster.getNamenode("ns0", nsId).getNamenode().getNameNodeState()); + } private void allRoutersHeartbeatNotLoadCache(boolean useObserver) throws IOException { for (RouterContext routerContext : cluster.getRouters()) { - List ns0 = routerContext.getRouter() - .getNamenodeResolver() - .getNamenodesForNameserviceId("ns0", useObserver); // Manually trigger the heartbeat, but the router does not manually load the cache Collection heartbeatServices = routerContext .getRouter().getNamenodeHeartbeatServices(); From ba468938feb4dd90057f2b8326e09bdad294e3b5 Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Thu, 19 Oct 2023 15:29:12 +0800 Subject: [PATCH 05/10] fix: No Namenodes available when use observer --- .../resolver/MembershipNamenodeResolver.java | 43 ++---- 
.../federation/MiniRouterDFSCluster.java | 13 ++ .../TestNoNamenodesAvailableLongTime.java | 140 ++++++++++-------- 3 files changed, 103 insertions(+), 93 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java index 3de331a642780..342bc04dbf853 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/resolver/MembershipNamenodeResolver.java @@ -491,43 +491,26 @@ public void setRouterId(String router) { public void rotateCache( String nsId, FederationNamenodeContext namenode, boolean listObserversFirst) { cacheNS.compute(Pair.of(nsId, listObserversFirst), (ns, namenodeContexts) -> { - if (namenodeContexts == null) { + if (namenodeContexts == null || namenodeContexts.size() <= 1) { return namenodeContexts; } - List nonOberverNnContexts = new ArrayList<>(); - List oberverNnContexts = new ArrayList<>(); + for (FederationNamenodeContext namenodeContext : namenodeContexts) { - if (namenodeContext.getState() == OBSERVER) { - oberverNnContexts.add(namenodeContext); - }else { - nonOberverNnContexts.add(namenodeContext); + if (namenodeContext.getState() == ACTIVE) { + return namenodeContexts; } } - if (nonOberverNnContexts.size() <= 1) { - return namenodeContexts; - } - FederationNamenodeContext firstNamenodeContext = nonOberverNnContexts.get(0); - /* - * If the first nn in the cache is active, the active nn priority cannot be lowered. - * This happens when other threads have already updated the cache. 
- */ - if (firstNamenodeContext.getState().equals(ACTIVE)) { + + FederationNamenodeContext lastNamenode = namenodeContexts.get(namenodeContexts.size()-1); + + if (lastNamenode.getRpcAddress().equals(namenode.getRpcAddress())) { return namenodeContexts; } - /* - * If the first nn in the cache at this time is not the nn - * that needs to be lowered in priority, there is no need to rotate. - * This happens when other threads have already rotated the cache. - */ - if (firstNamenodeContext.getRpcAddress().equals(namenode.getRpcAddress())) { - Collections.rotate(nonOberverNnContexts, -1); - String firstNamenodeId = nonOberverNnContexts.get(0).getNamenodeId(); - LOG.info("Rotate cache of pair , put namenode: {} in the " + - "first position of the cache and namenode: {} in the last position of the cache", - nsId, listObserversFirst, firstNamenodeId, namenode.getNamenodeId()); - oberverNnContexts.addAll(nonOberverNnContexts); - return oberverNnContexts; - } + List rotateNamenodeContexts = + (List) namenodeContexts; + rotateNamenodeContexts.remove(namenode); + rotateNamenodeContexts.add(namenode); + LOG.info("Rotate cache of pair<{}, {}> -> {}", nsId, listObserversFirst, namenodeContexts); return namenodeContexts; }); } diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/MiniRouterDFSCluster.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/MiniRouterDFSCluster.java index 4a023b3c4e035..70d1f808958ed 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/MiniRouterDFSCluster.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/MiniRouterDFSCluster.java @@ -253,6 +253,19 @@ public FileSystem getFileSystemWithObserverReadProxyProvider() throws IOExceptio return DistributedFileSystem.get(observerReadConf); } + public FileSystem getFileSystemWithConfiguredFailoverProxyProvider() 
throws IOException { + conf.set(DFS_NAMESERVICES, + conf.get(DFS_NAMESERVICES)+ ",router-service"); + conf.set(DFS_HA_NAMENODES_KEY_PREFIX + ".router-service", "router1"); + conf.set(DFS_NAMENODE_RPC_ADDRESS_KEY+ ".router-service.router1", + getFileSystemURI().toString()); + conf.set(HdfsClientConfigKeys.Failover.PROXY_PROVIDER_KEY_PREFIX + + "." + "router-service", ConfiguredFailoverProxyProvider.class.getName()); + DistributedFileSystem.setDefaultUri(conf, "hdfs://router-service"); + + return DistributedFileSystem.get(conf); + } + public DFSClient getClient(UserGroupInformation user) throws IOException, URISyntaxException, InterruptedException { diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java index 8fc3f9ab03cf6..cd348b25f5472 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -17,15 +17,14 @@ import org.apache.hadoop.hdfs.server.federation.metrics.FederationRPCMetrics; import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeContext; import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeServiceState; +import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.util.Lists; -import org.apache.hadoop.util.Time; import org.junit.After; import org.junit.Test; import java.io.IOException; -import java.net.URI; import java.util.Collection; import java.util.List; @@ -45,11 +44,17 @@ */ public class TestNoNamenodesAvailableLongTime { - + private static final long CACHE_FLUSH_INTERVAL_MS = 10000; 
private StateStoreDFSCluster cluster; + private FileSystem fileSystem; @After - public void cleanup() { + public void cleanup() throws IOException { + if (fileSystem != null) { + fileSystem.close(); + fileSystem = null; + } + if (cluster != null) { cluster.shutdown(); cluster = null; @@ -63,7 +68,7 @@ private void setupCluster(int numNameservices, int numberOfObserver, boolean use } int numberOfNamenode = 2 + numberOfObserver; cluster = new StateStoreDFSCluster(true, numNameservices, numberOfNamenode, - DEFAULT_HEARTBEAT_INTERVAL_MS, 10000); + DEFAULT_HEARTBEAT_INTERVAL_MS, CACHE_FLUSH_INTERVAL_MS); Configuration routerConf = new RouterConfigBuilder() .stateStore() .metrics() @@ -113,9 +118,9 @@ private void setupCluster(int numNameservices, int numberOfObserver, boolean use public void testCacheShouldNotBeRotated() throws Exception { setupCluster(1, 0, false); transitionClusterNSToStandby(cluster); - allRoutersHeartbeatAndLoadCache(); - // Record the time after the router first updated the cache - long firstLoadTime = Time.now(); + allRoutersHeartbeat(); + allRoutersLoadCache(); + List namenodes = cluster.getNamenodes(); // Make sure all namenodes are in standby state @@ -123,37 +128,33 @@ public void testCacheShouldNotBeRotated() throws Exception { assertEquals(STANDBY.ordinal(), namenodeContext.getNamenode().getNameNodeState()); } - Configuration conf = cluster.getRouterClientConf(); - // Set dfs.client.failover.random.order false, to pick 1st router at first - conf.setBoolean("dfs.client.failover.random.order", false); - - // Retries is 2 (see FailoverOnNetworkExceptionRetry#shouldRetry, will fail - // when reties > max.attempts), so total access is 3. 
- conf.setInt("dfs.client.retry.max.attempts", 1); - DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf); - - RouterContext routerContext = cluster.getRouters().get(0); + RouterContext routerContext = cluster.getRandomRouter(); // Get the second namenode in the router cache and make it active setSecondNonObserverNamenodeInTheRouterCacheActive(routerContext, 0, false); - allRoutersHeartbeatNotLoadCache(false); + allRoutersHeartbeat(); - // Get router0 metrics - FederationRPCMetrics rpcMetrics0 = routerContext.getRouter().getRpcServer().getRPCMetrics(); + // Get router metrics + FederationRPCMetrics rpcMetrics = routerContext.getRouter().getRpcServer().getRPCMetrics(); // Original failures - long originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); + long proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); // At this time, the router has recorded 2 standby namenodes in memory. assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); + // Retries is 2 (see FailoverOnNetworkExceptionRetry#shouldRetry, will fail + // when reties > max.attempts), so total access is 3. + routerContext.getConf().setInt("dfs.client.retry.max.attempts", 1); /* * The first accessed namenode is indeed standby, * then an NoNamenodesAvailableException will be reported for the first access, * and the next access will be successful. */ - routerClient.create("/test.txt", true); - assertEquals(originalRouter0NoNamenodesFailures + 1, rpcMetrics0.getProxyOpNoNamenodes()); - originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); + Path path = new Path("/test.file"); + fileSystem = routerContext.getFileSystemWithConfiguredFailoverProxyProvider(); + fileSystem.create(path); + assertEquals(proxyOpNoNamenodes + 1, rpcMetrics.getProxyOpNoNamenodes()); + proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); // At this time, the router has recorded 2 standby namenodes in memory. 
assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); @@ -162,8 +163,8 @@ public void testCacheShouldNotBeRotated() throws Exception { * we have put the actually active namenode at the front of the cache by rotating the cache. * Therefore, the access does not cause NoNamenodesAvailableException. */ - routerClient.setPermission("/test.txt", FsPermission.createImmutable((short)0640)); - assertEquals(originalRouter0NoNamenodesFailures, rpcMetrics0.getProxyOpNoNamenodes()); + fileSystem.setPermission(path, FsPermission.createImmutable((short)0640)); + assertEquals(proxyOpNoNamenodes, rpcMetrics.getProxyOpNoNamenodes()); // At this time, the router has recorded 2 standby namenodes in memory assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); @@ -176,43 +177,29 @@ public void testCacheShouldNotBeRotated() throws Exception { */ List aclSpec = Lists.newArrayList(aclEntry(DEFAULT, USER, "foo", ALL)); try { - routerClient.setAcl("/test.txt", aclSpec); + fileSystem.setAcl(path, aclSpec); }catch (RemoteException e) { assertTrue(e.getMessage().contains( "org.apache.hadoop.hdfs.server.federation.router.NoNamenodesAvailableException: " + "No namenodes available under nameservice ns0")); assertTrue(e.getMessage().contains( "org.apache.hadoop.hdfs.protocol.AclException: Invalid ACL: " + - "only directories may have a default ACL. Path: /test.txt")); + "only directories may have a default ACL. Path: /test.file")); } - assertEquals(originalRouter0NoNamenodesFailures + 3, rpcMetrics0.getProxyOpNoNamenodes()); - originalRouter0NoNamenodesFailures = rpcMetrics0.getProxyOpNoNamenodes(); + assertEquals(proxyOpNoNamenodes + 3, rpcMetrics.getProxyOpNoNamenodes()); + proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); // So legal operations can be accessed normally without reporting NoNamenodesAvailableException. 
assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); - routerClient.getFileInfo("/"); - assertEquals(originalRouter0NoNamenodesFailures, rpcMetrics0.getProxyOpNoNamenodes()); + fileSystem.getFileStatus(path); + assertEquals(proxyOpNoNamenodes, rpcMetrics.getProxyOpNoNamenodes()); - /* - * Access the active namenode without waiting for the router to update the cache, - * even if there are 2 standby states recorded in the router memory. - */ - long endTime = Time.now(); assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); - assertTrue(endTime - firstLoadTime < cluster.getCacheFlushInterval()); } @Test public void testUseObserver() throws Exception { setupCluster(1, 2, true); - RouterContext router = cluster.getRandomRouter(); - long activeProxyOps = router.getRouter().getRpcServer() - .getRPCMetrics().getActiveProxyOps(); - - FileSystem fileSystem = router.getFileSystem(); - fileSystem.msync(); - assertEquals(activeProxyOps+1, router.getRouter().getRpcServer() - .getRPCMetrics().getActiveProxyOps()); transitionActiveToStandby(); List namenodes = cluster.getNamenodes(); @@ -220,18 +207,36 @@ public void testUseObserver() throws Exception { for (MiniRouterDFSCluster.NamenodeContext namenodeContext : namenodes) { assertNotEquals(ACTIVE.ordinal(), namenodeContext.getNamenode().getNameNodeState()); } - allRoutersHeartbeatAndLoadCache(); - setSecondNonObserverNamenodeInTheRouterCacheActive(router, 2, true); - allRoutersHeartbeatNotLoadCache(true); + allRoutersHeartbeat(); + allRoutersLoadCache(); + + RouterContext routerContext = cluster.getRandomRouter(); + setSecondNonObserverNamenodeInTheRouterCacheActive(routerContext, 2, true); + allRoutersHeartbeat(); - assertTrue(routerCacheNoActiveNamenode(router, "ns0", true)); + // Get router metrics + FederationRPCMetrics rpcMetrics = routerContext.getRouter().getRpcServer().getRPCMetrics(); + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); - long observerProxyOps = 
router.getRouter().getRpcServer() - .getRPCMetrics().getObserverProxyOps(); + fileSystem = routerContext.getFileSystemWithObserverReadProxyProvider(); Path path = new Path("/"); + long observerProxyOps = rpcMetrics.getObserverProxyOps(); fileSystem.getFileStatus(path); - assertEquals(observerProxyOps + 1, router.getRouter().getRpcServer() - .getRPCMetrics().getObserverProxyOps()); + assertEquals(observerProxyOps + 1, rpcMetrics.getObserverProxyOps()); + + stopObserver(2); + long proxyOpFailureOps = rpcMetrics.getProxyOpFailureCommunicate(); + + long proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); + + long standbyProxyOps = rpcMetrics.getProxyOps(); + + fileSystem.getFileStatus(new Path("/")); + assertEquals(proxyOpFailureOps + 2, rpcMetrics.getProxyOpFailureCommunicate()); + assertEquals(proxyOpNoNamenodes + 1, rpcMetrics.getProxyOpNoNamenodes()); + assertEquals(standbyProxyOps + 1, rpcMetrics.getProxyOps()); + + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); } /** @@ -249,14 +254,8 @@ private boolean routerCacheNoActiveNamenode( return true; } - private void allRoutersHeartbeatAndLoadCache() { + private void allRoutersLoadCache() { for (MiniRouterDFSCluster.RouterContext routerContext : cluster.getRouters()) { - // Manually trigger the heartbeat - Collection heartbeatServices = routerContext - .getRouter().getNamenodeHeartbeatServices(); - for (NamenodeHeartbeatService service : heartbeatServices) { - service.periodicInvoke(); - } // Update service cache routerContext.getRouter().getStateStore().refreshCaches(true); } @@ -275,7 +274,7 @@ private void setSecondNonObserverNamenodeInTheRouterCacheActive( } - private void allRoutersHeartbeatNotLoadCache(boolean useObserver) throws IOException { + private void allRoutersHeartbeat() throws IOException { for (RouterContext routerContext : cluster.getRouters()) { // Manually trigger the heartbeat, but the router does not manually load the cache Collection heartbeatServices = routerContext @@ 
-298,4 +297,19 @@ private void transitionActiveToStandby() { } } } + + private int stopObserver(int num) { + int nnIndex; + for (nnIndex = 0; nnIndex < cluster.getNamenodes().size(); nnIndex++) { + NameNode nameNode = cluster.getCluster().getNameNode(nnIndex); + if (nameNode != null && nameNode.isObserverState()) { + cluster.getCluster().shutdownNameNode(nnIndex); + num--; + if (num == 0) { + break; + } + } + } + return nnIndex; + } } From ab0b073e37260befcecd719050a5b52b4af71408 Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Thu, 19 Oct 2023 15:31:46 +0800 Subject: [PATCH 06/10] fix: No Namenodes available when use observer --- .../federation/router/TestNoNamenodesAvailableLongTime.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java index cd348b25f5472..6147e21b22187 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -8,7 +8,6 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.FsPermission; -import org.apache.hadoop.hdfs.DFSClient; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster; import org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster.RouterContext; From 8f3caa03a0b1ed190c56ec3d64c1fe0048357494 Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Thu, 19 Oct 2023 17:23:34 +0800 Subject: [PATCH 07/10] fix: No Namenodes available when use observer --- 
.../TestNoNamenodesAvailableLongTime.java | 87 +++++++++---------- 1 file changed, 41 insertions(+), 46 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java index 6147e21b22187..3343b2157e1d2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -28,8 +28,6 @@ import java.util.List; import static org.apache.hadoop.ha.HAServiceProtocol.HAServiceState.ACTIVE; -import static org.apache.hadoop.ha.HAServiceProtocol.HAServiceState.STANDBY; -import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToStandby;; import static org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster.DEFAULT_HEARTBEAT_INTERVAL_MS; import static org.apache.hadoop.hdfs.server.namenode.AclTestHelpers.aclEntry; import static org.junit.Assert.assertEquals; @@ -46,14 +44,17 @@ public class TestNoNamenodesAvailableLongTime { private static final long CACHE_FLUSH_INTERVAL_MS = 10000; private StateStoreDFSCluster cluster; private FileSystem fileSystem; + private RouterContext routerContext; + private FederationRPCMetrics rpcMetrics; @After public void cleanup() throws IOException { + rpcMetrics = null; + routerContext = null; if (fileSystem != null) { fileSystem.close(); fileSystem = null; } - if (cluster != null) { cluster.shutdown(); cluster = null; @@ -110,13 +111,9 @@ private void setupCluster(int numNameservices, int numberOfObserver, boolean use cluster.waitClusterUp(); } - /** - * - */ - @Test - public void testCacheShouldNotBeRotated() throws Exception { - setupCluster(1, 0, 
false); - transitionClusterNSToStandby(cluster); + private void initEnv(int numberOfObserver, boolean useObserver) throws Exception { + setupCluster(1, numberOfObserver, useObserver); + transitionActiveToStandby(); allRoutersHeartbeat(); allRoutersLoadCache(); @@ -124,33 +121,49 @@ public void testCacheShouldNotBeRotated() throws Exception { // Make sure all namenodes are in standby state for (MiniRouterDFSCluster.NamenodeContext namenodeContext : namenodes) { - assertEquals(STANDBY.ordinal(), namenodeContext.getNamenode().getNameNodeState()); + assertNotEquals(ACTIVE.ordinal(), namenodeContext.getNamenode().getNameNodeState()); } - RouterContext routerContext = cluster.getRandomRouter(); + routerContext = cluster.getRandomRouter(); // Get the second namenode in the router cache and make it active - setSecondNonObserverNamenodeInTheRouterCacheActive(routerContext, 0, false); + setSecondNonObserverNamenodeInTheRouterCacheActive(numberOfObserver, false); allRoutersHeartbeat(); // Get router metrics - FederationRPCMetrics rpcMetrics = routerContext.getRouter().getRpcServer().getRPCMetrics(); + rpcMetrics = routerContext.getRouter().getRpcServer().getRPCMetrics(); + + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", useObserver)); + + // Retries is 2 (see FailoverOnNetworkExceptionRetry#shouldRetry, will fail + // when reties > max.attempts), so total access is 3. + routerContext.getConf().setInt("dfs.client.retry.max.attempts", 1); + + if (useObserver) { + fileSystem = routerContext.getFileSystemWithObserverReadProxyProvider(); + } else { + fileSystem = routerContext.getFileSystemWithConfiguredFailoverProxyProvider(); + } + } + + /** + * + */ + @Test + public void testCacheShouldNotBeRotated() throws Exception { + initEnv(0, false); // Original failures long proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); // At this time, the router has recorded 2 standby namenodes in memory. 
assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); - // Retries is 2 (see FailoverOnNetworkExceptionRetry#shouldRetry, will fail - // when reties > max.attempts), so total access is 3. - routerContext.getConf().setInt("dfs.client.retry.max.attempts", 1); /* * The first accessed namenode is indeed standby, * then an NoNamenodesAvailableException will be reported for the first access, * and the next access will be successful. */ Path path = new Path("/test.file"); - fileSystem = routerContext.getFileSystemWithConfiguredFailoverProxyProvider(); fileSystem.create(path); assertEquals(proxyOpNoNamenodes + 1, rpcMetrics.getProxyOpNoNamenodes()); proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); @@ -172,7 +185,6 @@ public void testCacheShouldNotBeRotated() throws Exception { * If the router sends an illegal operation to active nn, * NoNamenodesAvailableException will still be reported at this time, * and the cache should not be rotated due to illegal operations. - * */ List aclSpec = Lists.newArrayList(aclEntry(DEFAULT, USER, "foo", ALL)); try { @@ -198,36 +210,19 @@ public void testCacheShouldNotBeRotated() throws Exception { @Test public void testUseObserver() throws Exception { - setupCluster(1, 2, true); - - transitionActiveToStandby(); - List namenodes = cluster.getNamenodes(); - - for (MiniRouterDFSCluster.NamenodeContext namenodeContext : namenodes) { - assertNotEquals(ACTIVE.ordinal(), namenodeContext.getNamenode().getNameNodeState()); - } - allRoutersHeartbeat(); - allRoutersLoadCache(); + initEnv(2, true); - RouterContext routerContext = cluster.getRandomRouter(); - setSecondNonObserverNamenodeInTheRouterCacheActive(routerContext, 2, true); - allRoutersHeartbeat(); - - // Get router metrics - FederationRPCMetrics rpcMetrics = routerContext.getRouter().getRpcServer().getRPCMetrics(); - assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); - - fileSystem = routerContext.getFileSystemWithObserverReadProxyProvider(); Path path 
= new Path("/"); long observerProxyOps = rpcMetrics.getObserverProxyOps(); + long proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); fileSystem.getFileStatus(path); assertEquals(observerProxyOps + 1, rpcMetrics.getObserverProxyOps()); + assertEquals(proxyOpNoNamenodes + 1, rpcMetrics.getProxyOpNoNamenodes()); stopObserver(2); long proxyOpFailureOps = rpcMetrics.getProxyOpFailureCommunicate(); - - long proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); - + proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); long standbyProxyOps = rpcMetrics.getProxyOps(); fileSystem.getFileStatus(new Path("/")); @@ -254,14 +249,14 @@ private boolean routerCacheNoActiveNamenode( } private void allRoutersLoadCache() { - for (MiniRouterDFSCluster.RouterContext routerContext : cluster.getRouters()) { + for (MiniRouterDFSCluster.RouterContext context : cluster.getRouters()) { // Update service cache - routerContext.getRouter().getStateStore().refreshCaches(true); + context.getRouter().getStateStore().refreshCaches(true); } } private void setSecondNonObserverNamenodeInTheRouterCacheActive( - RouterContext routerContext, int numberOfObserver, boolean useObserver) throws IOException { + int numberOfObserver, boolean useObserver) throws IOException { List ns0 = routerContext.getRouter() .getNamenodeResolver() .getNamenodesForNameserviceId("ns0", useObserver); @@ -274,9 +269,9 @@ private void setSecondNonObserverNamenodeInTheRouterCacheActive( } private void allRoutersHeartbeat() throws IOException { - for (RouterContext routerContext : cluster.getRouters()) { + for (RouterContext context : cluster.getRouters()) { // Manually trigger the heartbeat, but the router does not manually load the cache - Collection heartbeatServices = routerContext + Collection heartbeatServices = context .getRouter().getNamenodeHeartbeatServices(); for (NamenodeHeartbeatService service : heartbeatServices) { 
service.periodicInvoke(); From f666e1723a5f5ae6b8ce168483d241f5437944a6 Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Fri, 20 Oct 2023 14:17:34 +0800 Subject: [PATCH 08/10] fix: No Namenodes available when use observer --- .../TestNoNamenodesAvailableLongTime.java | 78 ++++++++++++------- 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java index 3343b2157e1d2..b190faf9e8137 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -41,6 +41,7 @@ */ public class TestNoNamenodesAvailableLongTime { + // router load cache interval 10s private static final long CACHE_FLUSH_INTERVAL_MS = 10000; private StateStoreDFSCluster cluster; private FileSystem fileSystem; @@ -61,6 +62,13 @@ public void cleanup() throws IOException { } } + /** + * Set up state store cluster. 
+ * + * @param numNameservices number of name services + * @param numberOfObserver number of observer + * @param useObserver whether to use observer + */ private void setupCluster(int numNameservices, int numberOfObserver, boolean useObserver) throws Exception { if (!useObserver) { @@ -77,6 +85,7 @@ private void setupCluster(int numNameservices, int numberOfObserver, boolean use .heartbeat() .build(); + // Set router observer related configs if (useObserver) { routerConf.setBoolean(RBFConfigKeys.DFS_ROUTER_OBSERVER_READ_DEFAULT_KEY, true); routerConf.setBoolean(DFSConfigKeys.DFS_HA_TAILEDITS_INPROGRESS_KEY, true); @@ -85,16 +94,13 @@ private void setupCluster(int numNameservices, int numberOfObserver, boolean use // Reduce the number of RPC clients threads to overload the Router easy routerConf.setInt(RBFConfigKeys.DFS_ROUTER_CLIENT_THREADS_SIZE, 4); - // Overload control - routerConf.setBoolean( - RBFConfigKeys.DFS_ROUTER_CLIENT_REJECT_OVERLOAD, false); - // No need for datanodes as we use renewLease() for testing + // No need for datanodes cluster.setNumDatanodesPerNameservice(0); cluster.addRouterOverrides(routerConf); - cluster.startCluster(); + // Making one Namenode active per nameservice if (cluster.isHighAvailability()) { for (String ns : cluster.getNameservices()) { @@ -111,9 +117,16 @@ private void setupCluster(int numNameservices, int numberOfObserver, boolean use cluster.waitClusterUp(); } + /** + * Initialize the test environment and start the cluster so that + * there is no active namenode record in the router cache, + * but the second non-observer namenode in the router cache is actually active. + */ private void initEnv(int numberOfObserver, boolean useObserver) throws Exception { setupCluster(1, numberOfObserver, useObserver); + // Transition all namenodes in the cluster are standby. 
transitionActiveToStandby(); + // allRoutersHeartbeat(); allRoutersLoadCache(); @@ -146,31 +159,25 @@ private void initEnv(int numberOfObserver, boolean useObserver) throws Exception } } - /** - * - */ @Test - public void testCacheShouldNotBeRotated() throws Exception { + public void testShouldRotatedCache() throws Exception { initEnv(0, false); - // Original failures - long proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); - // At this time, the router has recorded 2 standby namenodes in memory. assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); - /* - * The first accessed namenode is indeed standby, - * then an NoNamenodesAvailableException will be reported for the first access, - * and the next access will be successful. - */ Path path = new Path("/test.file"); fileSystem.create(path); - assertEquals(proxyOpNoNamenodes + 1, rpcMetrics.getProxyOpNoNamenodes()); - proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); + assertEquals(1, rpcMetrics.getProxyOpNoNamenodes()); // At this time, the router has recorded 2 standby namenodes in memory. assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); + } + @Test + public void testShouldNotBeRotatedCache() throws Exception { + testShouldRotatedCache(); + long proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); + Path path = new Path("/test.file"); /* * we have put the actually active namenode at the front of the cache by rotating the cache. * Therefore, the access does not cause NoNamenodesAvailableException. 
@@ -213,22 +220,35 @@ public void testUseObserver() throws Exception { initEnv(2, true); Path path = new Path("/"); - long observerProxyOps = rpcMetrics.getObserverProxyOps(); - long proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); fileSystem.getFileStatus(path); - assertEquals(observerProxyOps + 1, rpcMetrics.getObserverProxyOps()); - assertEquals(proxyOpNoNamenodes + 1, rpcMetrics.getProxyOpNoNamenodes()); + assertEquals(1, rpcMetrics.getObserverProxyOps()); + assertEquals(1, rpcMetrics.getProxyOpNoNamenodes()); + + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); + } + + @Test + public void testAtLeastOneObserverNormal() throws Exception { + initEnv(2, true); + stopObserver(1); + fileSystem.getFileStatus(new Path("/")); + assertEquals(1, rpcMetrics.getProxyOpNoNamenodes()); + assertEquals(1, rpcMetrics.getObserverProxyOps()); + + assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); + } + + @Test + public void testAllObserverAbnormality() throws Exception { + initEnv(2, true); stopObserver(2); - long proxyOpFailureOps = rpcMetrics.getProxyOpFailureCommunicate(); - proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); - long standbyProxyOps = rpcMetrics.getProxyOps(); fileSystem.getFileStatus(new Path("/")); - assertEquals(proxyOpFailureOps + 2, rpcMetrics.getProxyOpFailureCommunicate()); - assertEquals(proxyOpNoNamenodes + 1, rpcMetrics.getProxyOpNoNamenodes()); - assertEquals(standbyProxyOps + 1, rpcMetrics.getProxyOps()); + assertEquals(2, rpcMetrics.getProxyOpFailureCommunicate()); + assertEquals(2, rpcMetrics.getProxyOpNoNamenodes()); + assertEquals(2, rpcMetrics.getProxyOps()); assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); } From 060d4fd23743e9990aa4bd492cf080c3b49e4cf1 Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Fri, 20 Oct 2023 16:04:19 +0800 Subject: [PATCH 09/10] fix: No Namenodes available 
when use observer --- .../federation/router/RouterRpcClient.java | 13 ++- .../TestNoNamenodesAvailableLongTime.java | 108 ++++++++++++++++-- 2 files changed, 108 insertions(+), 13 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java index 67b40a9977e68..c6a56b67540ba 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java @@ -787,7 +787,9 @@ public static boolean isUnavailableException(IOException ioe) { private boolean isClusterUnAvailable( String nsId, FederationNamenodeContext namenode, boolean listObserverFirst) throws IOException { - + // Use observer and the namenode that causes the exception is an observer, + // false is returned so that the observer can be marked as unavailable, so other observers + // or active namenode which is standby in the cache of the router can be retried. if (listObserverFirst && namenode.getState() == FederationNamenodeServiceState.OBSERVER) { return false; } @@ -1845,7 +1847,14 @@ private LongAccumulator getTimeOfLastCallToActive(String namespaceId) { .computeIfAbsent(namespaceId, key -> new LongAccumulator(Math::max, 0)); } - // todo javadoc + /** + * Determine whether router rotated cache is required when NoNamenodesAvailableException occurs. + * + * @param ioe cause of the NoNamenodesAvailableException. + * @return true if NoNamenodesAvailableException occurs due to + * {@link RouterRpcClient#isUnavailableException(IOException) unavailable exception}, + * otherwise false. 
+ */ private boolean shouldRotateCache(IOException ioe) { if (isUnavailableException(ioe)) { return true; diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java index b190faf9e8137..f6b6c2256a882 100644 --- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java +++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestNoNamenodesAvailableLongTime.java @@ -35,9 +35,12 @@ import static org.junit.Assert.assertTrue; /** - * When failover occurs, the router may record that the ns has no active namenode. + * When failover occurs, the router may record that the ns has no active namenode + * even if there is actually an active namenode. * Only when the router updates the cache next time can the memory status be updated, - * causing the router to report NoNamenodesAvailableException for a long time. + * causing the router to report NoNamenodesAvailableException for a long time, + * + * @see org.apache.hadoop.hdfs.server.federation.router.NoNamenodesAvailableException */ public class TestNoNamenodesAvailableLongTime { @@ -159,20 +162,35 @@ private void initEnv(int numberOfObserver, boolean useObserver) throws Exception } } + /** + * If NoNamenodesAvailableException occurs due to + * {@link RouterRpcClient#isUnavailableException(IOException) unavailable exception}, + * should rotated Cache. + */ @Test public void testShouldRotatedCache() throws Exception { + // 2 namenodes: 1 active, 1 standby. + // But there is no active namenode in router cache. initEnv(0, false); // At this time, the router has recorded 2 standby namenodes in memory. 
assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); Path path = new Path("/test.file"); + // The first create operation will cause NoNamenodesAvailableException and RotatedCache. + // After retrying, create and complete operation will be executed successfully. fileSystem.create(path); assertEquals(1, rpcMetrics.getProxyOpNoNamenodes()); - // At this time, the router has recorded 2 standby namenodes in memory. + // At this time, the router has recorded 2 standby namenodes in memory, + // the operation can be successful without waiting for the router load cache. assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); } + /** + * If a request still fails even if it is sent to active, + * then the change operation itself is illegal, + * the cache should not be rotated due to illegal operations. + */ @Test public void testShouldNotBeRotatedCache() throws Exception { testShouldRotatedCache(); @@ -180,7 +198,7 @@ public void testShouldNotBeRotatedCache() throws Exception { Path path = new Path("/test.file"); /* * we have put the actually active namenode at the front of the cache by rotating the cache. - * Therefore, the access does not cause NoNamenodesAvailableException. + * Therefore, the setPermission operation does not cause NoNamenodesAvailableException. */ fileSystem.setPermission(path, FsPermission.createImmutable((short)0640)); assertEquals(proxyOpNoNamenodes, rpcMetrics.getProxyOpNoNamenodes()); @@ -189,9 +207,9 @@ public void testShouldNotBeRotatedCache() throws Exception { assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); /* - * If the router sends an illegal operation to active nn, - * NoNamenodesAvailableException will still be reported at this time, - * and the cache should not be rotated due to illegal operations. + * Even if the router transfers the illegal request to active, + * NoNamenodesAvailableException will still be generated. + * Therefore, rotated cache is not needed. 
*/ List aclSpec = Lists.newArrayList(aclEntry(DEFAULT, USER, "foo", ALL)); try { @@ -204,6 +222,8 @@ public void testShouldNotBeRotatedCache() throws Exception { "org.apache.hadoop.hdfs.protocol.AclException: Invalid ACL: " + "only directories may have a default ACL. Path: /test.file")); } + // Retries is 2 (see FailoverOnNetworkExceptionRetry#shouldRetry, will fail + // when retries > max.attempts), so total access is 3. assertEquals(proxyOpNoNamenodes + 3, rpcMetrics.getProxyOpNoNamenodes()); proxyOpNoNamenodes = rpcMetrics.getProxyOpNoNamenodes(); @@ -212,49 +232,99 @@ public void testShouldNotBeRotatedCache() throws Exception { fileSystem.getFileStatus(path); assertEquals(proxyOpNoNamenodes, rpcMetrics.getProxyOpNoNamenodes()); + // At this time, the router has recorded 2 standby namenodes in memory, + // the operation can be successful without waiting for the router load cache. assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", false)); } + /** + * In the observer scenario, NoNamenodesAvailableException occurs, + * the operation can be successful without waiting for the router load cache. + */ @Test public void testUseObserver() throws Exception { + // 4 namenodes: 2 observers, 1 active, 1 standby. + // But there is no active namenode in router cache. initEnv(2, true); Path path = new Path("/"); + // At this time, the router has recorded 2 standby namenodes in memory. assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); + + // The first msync operation will cause NoNamenodesAvailableException and RotatedCache. + // After retrying, msync and getFileInfo operation will be executed successfully. fileSystem.getFileStatus(path); assertEquals(1, rpcMetrics.getObserverProxyOps()); assertEquals(1, rpcMetrics.getProxyOpNoNamenodes()); + // At this time, the router has recorded 2 standby namenodes in memory, + // the operation can be successful without waiting for the router load cache. 
assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); } + /** + * In a multi-observer environment, if at least one observer is normal, + * read requests can still succeed even if NoNamenodesAvailableException occurs. + */ @Test public void testAtLeastOneObserverNormal() throws Exception { + // 4 namenodes: 2 observers, 1 active, 1 standby. + // But there is no active namenode in router cache. initEnv(2, true); + // Shutdown one observer. stopObserver(1); + /* + * The first msync operation will cause NoNamenodesAvailableException and RotatedCache. + * After retrying, msync operation will be executed successfully. + * Each read request will shuffle the observer, + * if the getFileInfo operation is sent to the downed observer, + * it will cause NoNamenodesAvailableException, + * at this time, the request can be retried to the normal observer, + * no NoNamenodesAvailableException will be generated and the operation will be successful. + */ fileSystem.getFileStatus(new Path("/")); assertEquals(1, rpcMetrics.getProxyOpNoNamenodes()); assertEquals(1, rpcMetrics.getObserverProxyOps()); + // At this time, the router has recorded 2 standby namenodes in memory, + // the operation can be successful without waiting for the router load cache. assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); } + /** + * If all observers are down, read requests can succeed, + * even if a NoNamenodesAvailableException occurs. + */ @Test public void testAllObserverAbnormality() throws Exception { + // 4 namenodes: 2 observers, 1 active, 1 standby. + // But there is no active namenode in router cache. initEnv(2, true); + // Shutdown all observers. stopObserver(2); + /* + * The first msync operation will cause NoNamenodesAvailableException and RotatedCache. + * After retrying, msync operation will be executed successfully. 
+ * The getFileInfo operation retried 2 namenodes, both causing UnavailableException, + * and continued to retry to the standby namenode, + * causing NoNamenodesAvailableException and RotatedCache, + * and the execution was successful after retrying. + */ fileSystem.getFileStatus(new Path("/")); assertEquals(2, rpcMetrics.getProxyOpFailureCommunicate()); assertEquals(2, rpcMetrics.getProxyOpNoNamenodes()); - assertEquals(2, rpcMetrics.getProxyOps()); + // At this time, the router has recorded 2 standby namenodes in memory, + // the operation can be successful without waiting for the router load cache. assertTrue(routerCacheNoActiveNamenode(routerContext, "ns0", true)); } /** - * Determine whether the router has an active namenode. + * Determine whether cache of the router has an active namenode. + * + * @return true if no active namenode, otherwise false. */ private boolean routerCacheNoActiveNamenode( RouterContext context, String nsId, boolean useObserver) throws IOException { @@ -268,6 +338,9 @@ private boolean routerCacheNoActiveNamenode( return true; } + /** + * All routers in the cluster force loadcache. + */ private void allRoutersLoadCache() { for (MiniRouterDFSCluster.RouterContext context : cluster.getRouters()) { // Update service cache @@ -275,6 +348,9 @@ private void allRoutersLoadCache() { } } + /** + * Set the second non-observer state namenode in the router cache to active. + */ private void setSecondNonObserverNamenodeInTheRouterCacheActive( int numberOfObserver, boolean useObserver) throws IOException { List ns0 = routerContext.getRouter() @@ -288,6 +364,9 @@ private void setSecondNonObserverNamenodeInTheRouterCacheActive( } + /** + * All routers in the cluster force heartbeat. 
+ */ private void allRoutersHeartbeat() throws IOException { for (RouterContext context : cluster.getRouters()) { // Manually trigger the heartbeat, but the router does not manually load the cache @@ -299,6 +378,9 @@ private void allRoutersHeartbeat() throws IOException { } } + /** + * Transition the active namenode in the cluster to standby. + */ private void transitionActiveToStandby() { if (cluster.isHighAvailability()) { for (String ns : cluster.getNameservices()) { @@ -312,7 +394,12 @@ private void transitionActiveToStandby() { } } - private int stopObserver(int num) { + /** + * Shutdown observer namenode in the cluster. + * + * @param num The number of observers to shutdown. + */ + private void stopObserver(int num) { int nnIndex; for (nnIndex = 0; nnIndex < cluster.getNamenodes().size(); nnIndex++) { NameNode nameNode = cluster.getCluster().getNameNode(nnIndex); @@ -324,6 +411,5 @@ private void stopObserver(int num) { } } } - return nnIndex; } } From 74b746f91a4863a05e9b56912e72b8e6a6e3524a Mon Sep 17 00:00:00 2001 From: Jian Zhang <1361320460@qq.com> Date: Fri, 20 Oct 2023 16:05:50 +0800 Subject: [PATCH 10/10] pom --- hadoop-client-modules/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hadoop-client-modules/pom.xml b/hadoop-client-modules/pom.xml index 907fc1de14674..fb4aedb0aeb43 100644 --- a/hadoop-client-modules/pom.xml +++ b/hadoop-client-modules/pom.xml @@ -37,9 +37,9 @@ hadoop-client-minicluster hadoop-client-check-invariants - + hadoop-client-check-test-invariants - + hadoop-client-integration-tests