Skip to content

Commit

Permalink
[Broker] Make health check fail if dead locked threads are detected (a…
Browse files Browse the repository at this point in the history
…pache#15155)

* [Broker] Make health check fail if dead locked threads are detected

* Add unit test for detecting a dead lock

* Use lockInterruptibly to unlock the deadlock and wait for threads to finish

* Add test for testing the deadlock detection overhead

(cherry picked from commit df0c110)
(cherry picked from commit 6b163f1)
  • Loading branch information
lhotari committed Apr 22, 2022
1 parent 346a8a9 commit e18ec3a
Show file tree
Hide file tree
Showing 4 changed files with 279 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ static String callDiagnosticCommand(String operationName, String... args)

static String buildDeadlockInfo() {
ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
long[] threadIds = threadBean.findMonitorDeadlockedThreads();
long[] threadIds = threadBean.findDeadlockedThreads();
if (threadIds != null && threadIds.length > 0) {
StringWriter stringWriter = new StringWriter();
PrintWriter out = new PrintWriter(stringWriter);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,12 @@
import io.swagger.annotations.ApiOperation;
import io.swagger.annotations.ApiResponse;
import io.swagger.annotations.ApiResponses;
import java.lang.management.ManagementFactory;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
Expand All @@ -34,6 +38,7 @@
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
import javax.ws.rs.POST;
Expand Down Expand Up @@ -64,6 +69,7 @@
import org.apache.pulsar.common.policies.data.BrokerInfo;
import org.apache.pulsar.common.policies.data.NamespaceOwnershipStatus;
import org.apache.pulsar.common.util.FutureUtil;
import org.apache.pulsar.common.util.ThreadDumpUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -74,6 +80,10 @@ public class BrokersBase extends PulsarWebResource {
private static final Logger LOG = LoggerFactory.getLogger(BrokersBase.class);
private static final Duration HEALTHCHECK_READ_TIMEOUT = Duration.ofSeconds(10);
public static final String HEALTH_CHECK_TOPIC_SUFFIX = "healthcheck";
// log a full thread dump when a deadlock is detected in healthcheck once every 10 minutes
// to prevent excessive logging
private static final long LOG_THREADDUMP_INTERVAL_WHEN_DEADLOCK_DETECTED = 600000L;
private volatile long threadDumpLoggedTimestamp;

@GET
@Path("/{cluster}")
Expand Down Expand Up @@ -307,6 +317,7 @@ public void isReady(@Suspended AsyncResponse asyncResponse) {
@ApiResponse(code = 500, message = "Internal server error")})
public void healthcheck(@Suspended AsyncResponse asyncResponse) {
validateSuperUserAccess();
checkDeadlockedThreads();
internalRunHealthCheck()
.thenAccept(__ -> {
LOG.info("[{}] Successfully run health check.", clientAppId());
Expand All @@ -318,6 +329,27 @@ public void healthcheck(@Suspended AsyncResponse asyncResponse) {
});
}

private void checkDeadlockedThreads() {
ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
long[] threadIds = threadBean.findDeadlockedThreads();
if (threadIds != null && threadIds.length > 0) {
ThreadInfo[] threadInfos = threadBean.getThreadInfo(threadIds, false, false);
String threadNames = Arrays.stream(threadInfos)
.map(threadInfo -> threadInfo.getThreadName() + "(tid=" + threadInfo.getThreadId() + ")").collect(
Collectors.joining(", "));
if (System.currentTimeMillis() - threadDumpLoggedTimestamp
> LOG_THREADDUMP_INTERVAL_WHEN_DEADLOCK_DETECTED) {
threadDumpLoggedTimestamp = System.currentTimeMillis();
LOG.error("Deadlocked threads detected. {}\n{}", threadNames,
ThreadDumpUtil.buildThreadDiagnosticString());
} else {
LOG.error("Deadlocked threads detected. {}", threadNames);
}
throw new IllegalStateException("Deadlocked threads detected. " + threadNames);
}
}


private CompletableFuture<Void> internalRunHealthCheck() {
String topic;
PulsarClient client;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@
package org.apache.pulsar.broker.admin;

import com.google.common.collect.Sets;
import java.lang.management.ManagementFactory;
import java.lang.management.ThreadMXBean;
import java.time.Duration;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Phaser;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.stream.Collectors;
import org.apache.commons.collections.CollectionUtils;
import org.apache.pulsar.broker.auth.MockedPulsarServiceBaseTest;
Expand All @@ -36,6 +42,8 @@
@Test(groups = "broker")
public class AdminApiHealthCheckTest extends MockedPulsarServiceBaseTest {

private final ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();

@BeforeMethod
@Override
public void setup() throws Exception {
Expand Down Expand Up @@ -88,4 +96,73 @@ public void testHealthCheckup() throws Exception {
))
);
}

@Test(expectedExceptions= PulsarAdminException.class)
public void testHealthCheckupDetectsDeadlock() throws Exception {
// simulate a deadlock in the Test JVM
// the broker used in unit tests runs in the test JVM and the
// healthcheck implementation should detect this deadlock
Lock lock1 = new ReentrantReadWriteLock().writeLock();
Lock lock2 = new ReentrantReadWriteLock().writeLock();
final Phaser phaser = new Phaser(3);
Thread thread1=new Thread(() -> {
phaser.arriveAndAwaitAdvance();
try {
deadlock(lock1, lock2, 1000L);
} finally {
phaser.arriveAndDeregister();
}
}, "deadlockthread-1");
Thread thread2=new Thread(() -> {
phaser.arriveAndAwaitAdvance();
try {
deadlock(lock2, lock1, 2000L);
} finally {
phaser.arriveAndDeregister();
}
}, "deadlockthread-2");
thread1.start();
thread2.start();
phaser.arriveAndAwaitAdvance();
Thread.sleep(5000L);

try {
admin.brokers().healthcheck();
} finally {
// unlock the deadlock
thread1.interrupt();
thread2.interrupt();
// wait for deadlock threads to finish
phaser.arriveAndAwaitAdvance();
// wait for deadlocked status to clear before continuing
Awaitility.await().atMost(Duration.ofSeconds(10))
.until(() -> threadBean.findDeadlockedThreads() == null);
}
}

private void deadlock(Lock lock1, Lock lock2, long millis) {
try {
lock1.lockInterruptibly();
try {
Thread.sleep(millis);
lock2.lockInterruptibly();
lock2.unlock();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} finally {
lock1.unlock();
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}

@Test(timeOut = 5000L)
public void testDeadlockDetectionOverhead() {
for (int i=0; i < 1000; i++) {
long[] threadIds = threadBean.findDeadlockedThreads();
// assert that there's no deadlock
Assert.assertNull(threadIds);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pulsar.common.util;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.lang.management.LockInfo;
import java.lang.management.ManagementFactory;
import java.lang.management.MonitorInfo;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Map;
import javax.management.JMException;
import javax.management.ObjectName;

/**
* Adapted from Hadoop TimedOutTestsListener.
*
* https://raw.githubusercontent.com/apache/hadoop/master/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/test/TimedOutTestsListener.java
*/
public class ThreadDumpUtil {
private static final String INDENT = " ";

public static String buildThreadDiagnosticString() {
StringWriter sw = new StringWriter();
PrintWriter output = new PrintWriter(sw);

output.println(buildThreadDump());

String deadlocksInfo = buildDeadlockInfo();
if (deadlocksInfo != null) {
output.println("====> DEADLOCKS DETECTED <====");
output.println();
output.println(deadlocksInfo);
}

return sw.toString();
}

static String buildThreadDump() {
try {
// first attempt to use jcmd to do the thread dump, similar output to jstack
return callDiagnosticCommand("threadPrint", "-l");
} catch (Exception ignore) {
}

// fallback to using JMX for creating the thread dump
StringBuilder dump = new StringBuilder();

dump.append(String.format("Timestamp: %s", DateTimeFormatter.ISO_OFFSET_DATE_TIME.format(LocalDateTime.now())));
dump.append("\n\n");

Map<Thread, StackTraceElement[]> stackTraces = Thread.getAllStackTraces();
for (Map.Entry<Thread, StackTraceElement[]> e : stackTraces.entrySet()) {
Thread thread = e.getKey();
dump.append('\n');
dump.append(String.format("\"%s\" %s prio=%d tid=%d %s%njava.lang.Thread.State: %s", thread.getName(),
(thread.isDaemon() ? "daemon" : ""), thread.getPriority(), thread.getId(),
Thread.State.WAITING.equals(thread.getState()) ? "in Object.wait()" : thread.getState().name(),
Thread.State.WAITING.equals(thread.getState()) ? "WAITING (on object monitor)"
: thread.getState()));
for (StackTraceElement stackTraceElement : e.getValue()) {
dump.append("\n at ");
dump.append(stackTraceElement);
}
dump.append("\n");
}
return dump.toString();
}

/**
* Calls a diagnostic commands.
* The available operations are similar to what the jcmd commandline tool has,
* however the naming of the operations are different. The "help" operation can be used
* to find out the available operations. For example, the jcmd command "Thread.print" maps
* to "threadPrint" operation name.
*/
static String callDiagnosticCommand(String operationName, String... args)
throws JMException {
return (String) ManagementFactory.getPlatformMBeanServer()
.invoke(new ObjectName("com.sun.management:type=DiagnosticCommand"),
operationName, new Object[]{args}, new String[]{String[].class.getName()});
}

static String buildDeadlockInfo() {
ThreadMXBean threadBean = ManagementFactory.getThreadMXBean();
long[] threadIds = threadBean.findDeadlockedThreads();
if (threadIds != null && threadIds.length > 0) {
StringWriter stringWriter = new StringWriter();
PrintWriter out = new PrintWriter(stringWriter);

ThreadInfo[] infos = threadBean.getThreadInfo(threadIds, true, true);
for (ThreadInfo ti : infos) {
printThreadInfo(ti, out);
printLockInfo(ti.getLockedSynchronizers(), out);
out.println();
}

out.close();
return stringWriter.toString();
} else {
return null;
}
}

private static void printThreadInfo(ThreadInfo ti, PrintWriter out) {
// print thread information
printThread(ti, out);

// print stack trace with locks
StackTraceElement[] stacktrace = ti.getStackTrace();
MonitorInfo[] monitors = ti.getLockedMonitors();
for (int i = 0; i < stacktrace.length; i++) {
StackTraceElement ste = stacktrace[i];
out.println(INDENT + "at " + ste.toString());
for (MonitorInfo mi : monitors) {
if (mi.getLockedStackDepth() == i) {
out.println(INDENT + " - locked " + mi);
}
}
}
out.println();
}

private static void printThread(ThreadInfo ti, PrintWriter out) {
out.println();
out.print("\"" + ti.getThreadName() + "\"" + " Id=" + ti.getThreadId() + " in " + ti.getThreadState());
if (ti.getLockName() != null) {
out.print(" on lock=" + ti.getLockName());
}
if (ti.isSuspended()) {
out.print(" (suspended)");
}
if (ti.isInNative()) {
out.print(" (running in native)");
}
out.println();
if (ti.getLockOwnerName() != null) {
out.println(INDENT + " owned by " + ti.getLockOwnerName() + " Id=" + ti.getLockOwnerId());
}
}

private static void printLockInfo(LockInfo[] locks, PrintWriter out) {
out.println(INDENT + "Locked synchronizers: count = " + locks.length);
for (LockInfo li : locks) {
out.println(INDENT + " - " + li);
}
out.println();
}

}

0 comments on commit e18ec3a

Please sign in to comment.