Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HBASE-27968 add JvmPauseMonitor in hbase-client #5319

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import static org.apache.hadoop.hbase.client.NonceGenerator.CLIENT_NONCES_ENABLED_KEY;
import static org.apache.hadoop.hbase.trace.HBaseSemanticAttributes.SERVER_NAME_KEY;
import static org.apache.hadoop.hbase.util.FutureUtils.addListener;
import static org.apache.hadoop.hbase.util.JvmPauseMonitor.PAUSE_MONITOR_ENABLE_DEFAULT;
import static org.apache.hadoop.hbase.util.JvmPauseMonitor.PAUSE_MONITOR_ENABLE_KEY;

import io.opentelemetry.api.trace.Span;
import java.io.IOException;
Expand Down Expand Up @@ -55,6 +57,7 @@
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.trace.TraceUtil;
import org.apache.hadoop.hbase.util.ConcurrentMapUtils;
import org.apache.hadoop.hbase.util.JvmPauseMonitor;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.yetus.audience.InterfaceAudience;
Expand Down Expand Up @@ -121,6 +124,8 @@ public class AsyncConnectionImpl implements AsyncConnection {
private final String metricsScope;
private final Optional<MetricsConnection> metrics;

private JvmPauseMonitor pauseMonitor;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Make this a singleton, instead? In the case of applications opening or pooling multiple connections, we don't need a separate JvmPauseMonitor per connection, just a single one for the whole application.

Copy link
Contributor Author

@frostruan frostruan Jul 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea ! But if so, we may need to consider adding a reference count to pauseMonitor. Only when all connections that refer to it are no longer active, we can stop it . Or we can keep it all the time , even all the connections are closed ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think we don't need to stop it when connection is closed, it may be useful to have it running for as long the client application runs.


private final ClusterStatusListener clusterStatusListener;

private volatile ConnectionOverAsyncConnection conn;
Expand Down Expand Up @@ -179,6 +184,10 @@ public void newDead(ServerName sn) {
}
}
this.clusterStatusListener = listener;
if (conf.getBoolean(PAUSE_MONITOR_ENABLE_KEY, PAUSE_MONITOR_ENABLE_DEFAULT)) {
pauseMonitor = JvmPauseMonitor.getInstance(conf);
pauseMonitor.start();
}
}

private void spawnRenewalChore(final UserGroupInformation user) {
Expand Down Expand Up @@ -240,6 +249,9 @@ public void close() {
if (metrics.isPresent()) {
MetricsConnection.deleteMetricsConnection(metricsScope);
}
if (pauseMonitor != null) {
pauseMonitor.stop();
}
ConnectionOverAsyncConnection c = this.conn;
if (c != null) {
c.closePool();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.metrics.JvmPauseMonitorSource;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hbase.thirdparty.com.google.common.base.Stopwatch;
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
import org.apache.hbase.thirdparty.com.google.common.collect.Maps;
Expand All @@ -46,9 +46,13 @@
* which detects GC pauses(Todd Lipcon)
*/
@InterfaceAudience.Private
public class JvmPauseMonitor {
public final class JvmPauseMonitor {
private static final Logger LOG = LoggerFactory.getLogger(JvmPauseMonitor.class);

private static final AtomicInteger REF_CNT = new AtomicInteger();

private static JvmPauseMonitor INSTANCE;

/** The target sleep time */
private static final long SLEEP_INTERVAL_MS = 500;

Expand All @@ -62,28 +66,47 @@ public class JvmPauseMonitor {
public static final String INFO_THRESHOLD_KEY = "jvm.pause.info-threshold.ms";
private static final long INFO_THRESHOLD_DEFAULT = 1000;

public static final String PAUSE_MONITOR_ENABLE_KEY = "hbase.client.pause.monitor.enable";

public static final boolean PAUSE_MONITOR_ENABLE_DEFAULT = false;

private Thread monitorThread;
private volatile boolean shouldRun = true;
private JvmPauseMonitorSource metricsSource;
private final JvmPauseMonitorSource metricsSource;

public static synchronized JvmPauseMonitor getInstance(Configuration conf) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why change it to singleton? What if we pass different Configuration here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are two reasons.

@wchevreuil mentioned that our user may open multiple connections, however only one pause monitor is enough for a process. More details please see here

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we only want one JvmPauseMonitor in process, then we should not bind it to a connection? Just add a utility method to enable jvm pause monitor is enough?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. Let me try to fix this. Thanks Duo ! @Apache9

return getInstance(conf, null);
}

public JvmPauseMonitor(Configuration conf) {
this(conf, null);
public static synchronized JvmPauseMonitor getInstance(Configuration conf,
JvmPauseMonitorSource metricsSource) {
if (INSTANCE == null) {
INSTANCE = new JvmPauseMonitor(conf, metricsSource);
}
return INSTANCE;
}

public JvmPauseMonitor(Configuration conf, JvmPauseMonitorSource metricsSource) {
private JvmPauseMonitor(Configuration conf, JvmPauseMonitorSource metricsSource) {
this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
this.metricsSource = metricsSource;
}

public void start() {
Preconditions.checkState(monitorThread == null, "Already started");
public synchronized void start() {
if (REF_CNT.getAndIncrement() > 0) {
// pause monitor already started
return;
}
monitorThread = new Thread(new Monitor(), "JvmPauseMonitor");
monitorThread.setDaemon(true);
monitorThread.start();
}

public void stop() {
public synchronized void stop() {
if (REF_CNT.decrementAndGet() > 0) {
// there are still open connections
return;
}
shouldRun = false;
monitorThread.interrupt();
try {
Expand Down Expand Up @@ -113,7 +136,7 @@ private Map<String, GcTimes> getGcTimes() {
return map;
}

private static class GcTimes {
private static final class GcTimes {
private GcTimes(GarbageCollectorMXBean gcBean) {
gcCount = gcBean.getCollectionCount();
gcTimeMillis = gcBean.getCollectionTime();
Expand Down Expand Up @@ -196,17 +219,13 @@ public JvmPauseMonitorSource getMetricsSource() {
return metricsSource;
}

public void setMetricsSource(JvmPauseMonitorSource metricsSource) {
this.metricsSource = metricsSource;
}

/**
* Simple 'main' to facilitate manual testing of the pause monitor. This main function just leaks
* memory into a list. Running this class with a 1GB heap will very quickly go into "GC hell" and
* result in log messages about the GC pauses.
*/
public static void main(String[] args) throws Exception {
new JvmPauseMonitor(new Configuration()).start();
JvmPauseMonitor.getInstance(new Configuration()).start();
List<String> list = Lists.newArrayList();
int i = 0;
while (true) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
*/
package org.apache.hadoop.hbase.rest;

import static org.apache.hadoop.hbase.util.JvmPauseMonitor.PAUSE_MONITOR_ENABLE_DEFAULT;
import static org.apache.hadoop.hbase.util.JvmPauseMonitor.PAUSE_MONITOR_ENABLE_KEY;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Admin;
Expand Down Expand Up @@ -95,6 +98,11 @@ public synchronized static void stop() {
RESTServlet(final Configuration conf, final UserProvider userProvider) throws IOException {
this.realUser = userProvider.getCurrent().getUGI();
this.conf = conf;

// disable the pause monitor in connection because we will create a pause
// monitor that reports metric to JvmPauseMonitorSource
this.conf.setBoolean(PAUSE_MONITOR_ENABLE_KEY, PAUSE_MONITOR_ENABLE_DEFAULT);

registerCustomFilter(conf);

int cleanInterval = conf.getInt(CLEANUP_INTERVAL, 10 * 1000);
Expand All @@ -106,7 +114,7 @@ public synchronized static void stop() {

metrics = new MetricsREST();

pauseMonitor = new JvmPauseMonitor(conf, metrics.getSource());
pauseMonitor = JvmPauseMonitor.getInstance(conf, metrics.getSource());
pauseMonitor.start();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import static org.apache.hadoop.hbase.ChoreService.DEFAULT_CHORE_SERVICE_INITIAL_POOL_SIZE;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK;
import static org.apache.hadoop.hbase.HConstants.HBASE_SPLIT_WAL_COORDINATED_BY_ZK;
import static org.apache.hadoop.hbase.util.JvmPauseMonitor.PAUSE_MONITOR_ENABLE_DEFAULT;
import static org.apache.hadoop.hbase.util.JvmPauseMonitor.PAUSE_MONITOR_ENABLE_KEY;

import com.google.errorprone.annotations.RestrictedApi;
import io.opentelemetry.api.trace.Span;
Expand Down Expand Up @@ -255,6 +257,10 @@ public HBaseServerBase(Configuration conf, String name) throws IOException {
useThisHostnameInstead = getUseThisHostnameInstead(conf);
InetSocketAddress addr = rpcServices.getSocketAddress();

// disable the pause monitor in connection because we will create a pause
// monitor that reports metric to JvmPauseMonitorSource
this.conf.setBoolean(PAUSE_MONITOR_ENABLE_KEY, PAUSE_MONITOR_ENABLE_DEFAULT);

// if use-ip is enabled, we will use ip to expose Master/RS service for client,
// see HBASE-27304 for details.
boolean useIp = conf.getBoolean(HConstants.HBASE_SERVER_USEIP_ENABLED_KEY,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1437,7 +1437,7 @@ protected void handleReportForDutyResponse(final RegionServerStartupResponse c)
this.metricsRegionServer =
new MetricsRegionServer(metricsRegionServerImpl, conf, metricsTable);
// Now that we have a metrics source, start the pause monitor
this.pauseMonitor = new JvmPauseMonitor(conf, getMetrics().getMetricsSource());
this.pauseMonitor = JvmPauseMonitor.getInstance(conf, getMetrics().getMetricsSource());
pauseMonitor.start();

// There is a rare case where we do NOT want services to start. Check config.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ public void testPauseMonitor() {
Configuration conf = new Configuration();
conf.setLong(JvmPauseMonitor.INFO_THRESHOLD_KEY, 1000L);
conf.setLong(JvmPauseMonitor.WARN_THRESHOLD_KEY, 10000L);
JvmPauseMonitor monitor = new JvmPauseMonitor(conf, serverSource);
JvmPauseMonitor monitor = JvmPauseMonitor.getInstance(conf, serverSource);
monitor.updateMetrics(1500, false);
HELPER.assertCounter("pauseInfoThresholdExceeded", 1, serverSource);
HELPER.assertCounter("pauseWarnThresholdExceeded", 0, serverSource);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@
import static org.apache.hadoop.hbase.thrift.Constants.THRIFT_SSL_KEYSTORE_TYPE_KEY;
import static org.apache.hadoop.hbase.thrift.Constants.THRIFT_SUPPORT_PROXYUSER_KEY;
import static org.apache.hadoop.hbase.thrift.Constants.USE_HTTP_CONF_KEY;
import static org.apache.hadoop.hbase.util.JvmPauseMonitor.PAUSE_MONITOR_ENABLE_DEFAULT;
import static org.apache.hadoop.hbase.util.JvmPauseMonitor.PAUSE_MONITOR_ENABLE_KEY;

import java.io.IOException;
import java.net.InetAddress;
Expand Down Expand Up @@ -194,6 +196,9 @@ public class ThriftServer extends Configured implements Tool {

public ThriftServer(Configuration conf) {
this.conf = HBaseConfiguration.create(conf);
// disable the pause monitor in connection because we will create a pause
// monitor that reports metric to JvmPauseMonitorSource
this.conf.setBoolean(PAUSE_MONITOR_ENABLE_KEY, PAUSE_MONITOR_ENABLE_DEFAULT);
}

protected ThriftMetrics createThriftMetrics(Configuration conf) {
Expand Down Expand Up @@ -226,7 +231,7 @@ protected void setupParamters() throws IOException {

this.listenPort = conf.getInt(PORT_CONF_KEY, DEFAULT_LISTEN_PORT);
this.metrics = createThriftMetrics(conf);
this.pauseMonitor = new JvmPauseMonitor(conf, this.metrics.getSource());
this.pauseMonitor = JvmPauseMonitor.getInstance(conf, this.metrics.getSource());
this.hbaseServiceHandler = createHandler(conf, userProvider);
this.hbaseServiceHandler.initMetrics(metrics);
this.processor = createProcessor();
Expand Down