From e1a8e450a0c502d7ef38fe3f368d67e1d7775c37 Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Sun, 15 Apr 2018 13:13:03 -0400 Subject: [PATCH 1/3] Avoid self-deadlock in the translog Today when reading an operation from the current generation fails tragically we attempt to close the translog. However, by invoking close before releasing the read lock we end up in self-deadlock because closing tries to acquire the write lock and the read lock can not be upgraded to a write lock. To avoid this, we move the close invocation outside of the try-with-resources that acquired the read lock. As an extra guard against this, we document the problem and add an assertion that we are not trying to invoke close while holding the read lock. --- .../index/translog/Translog.java | 29 +++++++++++++------ .../index/translog/TranslogTests.java | 1 - 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/translog/Translog.java b/server/src/main/java/org/elasticsearch/index/translog/Translog.java index ab4961892ca12..71f741a216b09 100644 --- a/server/src/main/java/org/elasticsearch/index/translog/Translog.java +++ b/server/src/main/java/org/elasticsearch/index/translog/Translog.java @@ -583,12 +583,7 @@ public Operation readOperation(Location location) throws IOException { if (current.generation == location.generation) { // no need to fsync here the read operation will ensure that buffers are written to disk // if they are still in RAM and we are reading onto that position - try { - return current.read(location); - } catch (final Exception ex) { - closeOnTragicEvent(ex); - throw ex; - } + return current.read(location); } else { // read backwards - it's likely we need to read on that is recent for (int i = readers.size() - 1; i >= 0; i--) { @@ -598,6 +593,9 @@ public Operation readOperation(Location location) throws IOException { } } } + } catch (final Exception ex) { + closeOnTragicEvent(ex); + throw ex; } return null; } @@ -735,15 +733,28 @@ public boolean ensureSynced(Stream locations) throws IOException { } } + /** + * Closes the translog if the current translog writer experienced a tragic exception. + * + * Note that in case this thread closes the translog it must not already be holding a read lock on the translog as it will acquire a + * write lock in the course of closing the translog + * + * @param ex if an exception occurs closing the translog, it will be suppressed into the provided exception + */ private void closeOnTragicEvent(final Exception ex) { + // we can not hold a read lock here because closing will attempt to obtain a write lock and that would result in self-deadlock + assert readLock.isHeldByCurrentThread() == false : Thread.currentThread().getName(); if (current.getTragicException() != null) { try { close(); } catch (final AlreadyClosedException inner) { - // don't do anything in this case. The AlreadyClosedException comes from TranslogWriter and we should not add it as suppressed because - // will contain the Exception ex as cause. See also https://github.com/elastic/elasticsearch/issues/15941 + /* + * Don't do anything in this case. The AlreadyClosedException comes from TranslogWriter and we should not add it as + * suppressed because it will contain the provided exception as its cause. See also + * https://github.com/elastic/elasticsearch/issues/15941. + */ } catch (final Exception inner) { - assert (ex != inner.getCause()); + assert (ex != null && ex != inner.getCause()); ex.addSuppressed(inner); } } diff --git a/server/src/test/java/org/elasticsearch/index/translog/TranslogTests.java b/server/src/test/java/org/elasticsearch/index/translog/TranslogTests.java index 8899dca24b146..b3b9fca886e17 100644 --- a/server/src/test/java/org/elasticsearch/index/translog/TranslogTests.java +++ b/server/src/test/java/org/elasticsearch/index/translog/TranslogTests.java @@ -1812,7 +1812,6 @@ public void testTragicEventCanBeAnyException() throws IOException { assertTrue(translog.getTragicException() instanceof UnknownException); } - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/29509") public void testFatalIOExceptionsWhileWritingConcurrently() throws IOException, InterruptedException { Path tempDir = createTempDir(); final FailSwitch fail = new FailSwitch(); From bbce75b760954b364fd283ac262ca55951fd6a91 Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Sun, 15 Apr 2018 13:19:40 -0400 Subject: [PATCH 2/3] Upgrade assertion to hard exception, we will throw anyway --- .../main/java/org/elasticsearch/index/translog/Translog.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/translog/Translog.java b/server/src/main/java/org/elasticsearch/index/translog/Translog.java index 71f741a216b09..b12386042cfa8 100644 --- a/server/src/main/java/org/elasticsearch/index/translog/Translog.java +++ b/server/src/main/java/org/elasticsearch/index/translog/Translog.java @@ -744,6 +744,7 @@ public boolean ensureSynced(Stream locations) throws IOException { private void closeOnTragicEvent(final Exception ex) { // we can not hold a read lock here because closing will attempt to obtain a write lock and that would result in self-deadlock assert readLock.isHeldByCurrentThread() == false : Thread.currentThread().getName(); + Objects.requireNonNull(ex); if (current.getTragicException() != null) { try { close(); @@ -754,7 +755,7 @@ private void closeOnTragicEvent(final Exception ex) { * https://github.com/elastic/elasticsearch/issues/15941. */ } catch (final Exception inner) { - assert (ex != null && ex != inner.getCause()); + assert ex != inner.getCause(); ex.addSuppressed(inner); } } From cd503c45bd681eaff54ec355f46b9090702d5c9b Mon Sep 17 00:00:00 2001 From: Jason Tedor Date: Sun, 15 Apr 2018 14:54:27 -0400 Subject: [PATCH 3/3] Remove null check --- .../src/main/java/org/elasticsearch/index/translog/Translog.java | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/translog/Translog.java b/server/src/main/java/org/elasticsearch/index/translog/Translog.java index b12386042cfa8..cc5041bf24434 100644 --- a/server/src/main/java/org/elasticsearch/index/translog/Translog.java +++ b/server/src/main/java/org/elasticsearch/index/translog/Translog.java @@ -744,7 +744,6 @@ public boolean ensureSynced(Stream locations) throws IOException { private void closeOnTragicEvent(final Exception ex) { // we can not hold a read lock here because closing will attempt to obtain a write lock and that would result in self-deadlock assert readLock.isHeldByCurrentThread() == false : Thread.currentThread().getName(); - Objects.requireNonNull(ex); if (current.getTragicException() != null) { try { close();