From fcfe7a850dada2e65982f2bc805d8bc409f07512 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 14 Sep 2018 19:23:07 +0100 Subject: [PATCH 1/2] Add an awful secondary timeout to fix wedged requests This is an attempt to mitigate #3842 by adding yet-another-timeout --- synapse/http/matrixfederationclient.py | 11 ++++++ synapse/util/async_helpers.py | 51 ++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index da16b5dd8cd7..13b19f7626f7 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -42,6 +42,7 @@ ) from synapse.http.endpoint import matrix_federation_endpoint from synapse.util import logcontext +from synapse.util.async_helpers import timeout_no_seriously from synapse.util.logcontext import make_deferred_yieldable from synapse.util.metrics import Measure @@ -228,6 +229,16 @@ def _request(self, destination, method, path, ) request_deferred.addTimeout(_sec_timeout, self.hs.get_reactor()) + # Sometimes the timeout above doesn't work, so lets hack yet + # another layer of timeouts in in the vain hope that at some + # point the world made sense and this really really really + # should work. + request_deferred = timeout_no_seriously( + request_deferred, + timeout=_sec_timeout * 2, + reactor=self.hs.get_reactor(), + ) + with Measure(self.clock, "outbound_request"): response = yield make_deferred_yieldable( request_deferred, diff --git a/synapse/util/async_helpers.py b/synapse/util/async_helpers.py index 9b3f2f4b96df..083e4f412861 100644 --- a/synapse/util/async_helpers.py +++ b/synapse/util/async_helpers.py @@ -438,3 +438,54 @@ def _cancelled_to_timed_out_error(value, timeout): value.trap(CancelledError) raise DeferredTimeoutError(timeout, "Deferred") return value + + +def timeout_no_seriously(deferred, timeout, reactor): + """The in build twisted deferred addTimeout (and the method above) + completely fail to time things out under some unknown circumstances. + + Lets try a different way of timing things out and maybe that will make + things work?! + + TODO: Kill this with fire. + """ + + new_d = defer.Deferred() + + timed_out = [False] + + def time_it_out(): + timed_out[0] = True + deferred.cancel() + + if not new_d.called: + new_d.errback(DeferredTimeoutError(timeout, "Deferred")) + + delayed_call = reactor.callLater(timeout, time_it_out) + + def convert_cancelled(value): + if timed_out[0]: + return _cancelled_to_timed_out_error(value, timeout) + return value + + deferred.addBoth(convert_cancelled) + + def cancel_timeout(result): + # stop the pending call to cancel the deferred if it's been fired + if delayed_call.active(): + delayed_call.cancel() + return result + + deferred.addBoth(cancel_timeout) + + def success_cb(val): + if not new_d.called: + new_d.callback(val) + + def failure_cb(val): + if not new_d.called: + new_d.errback(val) + + deferred.addCallbacks(success_cb, failure_cb) + + return new_d From 335b23a078b416b3ea97d1bef73da6ca5abd8d4b Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 14 Sep 2018 19:25:58 +0100 Subject: [PATCH 2/2] Newsfile --- changelog.d/3875.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/3875.bugfix diff --git a/changelog.d/3875.bugfix b/changelog.d/3875.bugfix new file mode 100644 index 000000000000..2d2147dd4bc7 --- /dev/null +++ b/changelog.d/3875.bugfix @@ -0,0 +1 @@ +Mitigate outbound federation randomly becoming wedged