diff --git a/changelog.d/4539.misc b/changelog.d/4539.misc new file mode 100644 index 000000000000..b222c8d23f75 --- /dev/null +++ b/changelog.d/4539.misc @@ -0,0 +1 @@ +Update federation routing logic to check .well-known before SRV diff --git a/changelog.d/4542.misc b/changelog.d/4542.misc new file mode 100644 index 000000000000..74c84e02092e --- /dev/null +++ b/changelog.d/4542.misc @@ -0,0 +1 @@ +Improve performance of handling servers with invalid .well-known diff --git a/synapse/http/federation/matrix_federation_agent.py b/synapse/http/federation/matrix_federation_agent.py index 26649e70be3a..50baa8bf0a66 100644 --- a/synapse/http/federation/matrix_federation_agent.py +++ b/synapse/http/federation/matrix_federation_agent.py @@ -23,14 +23,17 @@ from twisted.internet import defer from twisted.internet.endpoints import HostnameEndpoint, wrapClientTLS +from twisted.internet.interfaces import IStreamClientEndpoint from twisted.web.client import URI, Agent, HTTPConnectionPool, RedirectAgent, readBody from twisted.web.http import stringToDatetime from twisted.web.http_headers import Headers from twisted.web.iweb import IAgent from synapse.http.federation.srv_resolver import SrvResolver, pick_server_from_list +from synapse.util import Clock from synapse.util.caches.ttlcache import TTLCache from synapse.util.logcontext import make_deferred_yieldable +from synapse.util.metrics import Measure # period to cache .well-known results for by default WELL_KNOWN_DEFAULT_CACHE_PERIOD = 24 * 3600 @@ -44,6 +47,8 @@ # cap for .well-known cache period WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600 +# magic value to mark an invalid well-known +INVALID_WELL_KNOWN = object() logger = logging.getLogger(__name__) well_known_cache = TTLCache('well-known') @@ -78,6 +83,8 @@ def __init__( _well_known_cache=well_known_cache, ): self._reactor = reactor + self._clock = Clock(reactor) + self._tls_client_options_factory = tls_client_options_factory if _srv_resolver is None: _srv_resolver = SrvResolver() @@ -98,6 +105,11 @@ def __init__( ) self._well_known_agent = _well_known_agent + # our cache of .well-known lookup results, mapping from server name + # to delegated name. The values can be: + # `bytes`: a valid server-name + # `None`: there is no .well-known here + # INVALID_WELL_KNWOWN: the .well-known here is invalid self._well_known_cache = _well_known_cache @defer.inlineCallbacks @@ -152,12 +164,9 @@ def request(self, method, uri, headers=None, bodyProducer=None): class EndpointFactory(object): @staticmethod def endpointForURI(_uri): - logger.info( - "Connecting to %s:%i", - res.target_host.decode("ascii"), - res.target_port, + ep = LoggingHostnameEndpoint( + self._reactor, res.target_host, res.target_port, ) - ep = HostnameEndpoint(self._reactor, res.target_host, res.target_port) if tls_options is not None: ep = wrapClientTLS(tls_options, ep) return ep @@ -210,11 +219,7 @@ def _route_matrix_uri(self, parsed_uri, lookup_well_known=True): target_port=parsed_uri.port, )) - # try a SRV lookup - service_name = b"_matrix._tcp.%s" % (parsed_uri.host,) - server_list = yield self._srv_resolver.resolve_service(service_name) - - if not server_list and lookup_well_known: + if lookup_well_known: # try a .well-known lookup well_known_server = yield self._get_well_known(parsed_uri.host) @@ -250,6 +255,10 @@ def _route_matrix_uri(self, parsed_uri, lookup_well_known=True): res = yield self._route_matrix_uri(new_uri, lookup_well_known=False) defer.returnValue(res) + # try a SRV lookup + service_name = b"_matrix._tcp.%s" % (parsed_uri.host,) + server_list = yield self._srv_resolver.resolve_service(service_name) + if not server_list: target_host = parsed_uri.host port = 8448 @@ -283,14 +292,35 @@ def _get_well_known(self, server_name): None if there was no .well-known file. """ try: - cached = self._well_known_cache[server_name] - defer.returnValue(cached) + result = self._well_known_cache[server_name] except KeyError: - pass + # TODO: should we linearise so that we don't end up doing two .well-known + # requests for the same server in parallel? + with Measure(self._clock, "get_well_known"): + result, cache_period = yield self._do_get_well_known(server_name) + + if cache_period > 0: + self._well_known_cache.set(server_name, result, cache_period) + + if result == INVALID_WELL_KNOWN: + raise Exception("invalid .well-known on this server") + + defer.returnValue(result) - # TODO: should we linearise so that we don't end up doing two .well-known requests - # for the same server in parallel? + @defer.inlineCallbacks + def _do_get_well_known(self, server_name): + """Actually fetch and parse a .well-known, without checking the cache + Args: + server_name (bytes): name of the server, from the requested url + + Returns: + Deferred[Tuple[bytes|None|object],int]: + result, cache period, where result is one of: + - the new server name from the .well-known (as a `bytes`) + - None if there was no .well-known file. + - INVALID_WELL_KNOWN if the .well-known was invalid + """ uri = b"https://%s/.well-known/matrix/server" % (server_name, ) uri_str = uri.decode("ascii") logger.info("Fetching %s", uri_str) @@ -308,9 +338,7 @@ def _get_well_known(self, server_name): # after startup cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER) - - self._well_known_cache.set(server_name, None, cache_period) - defer.returnValue(None) + defer.returnValue((None, cache_period)) try: parsed_body = json.loads(body.decode('utf-8')) @@ -320,7 +348,10 @@ def _get_well_known(self, server_name): if "m.server" not in parsed_body: raise Exception("Missing key 'm.server'") except Exception as e: - raise Exception("invalid .well-known response from %s: %s" % (uri_str, e,)) + logger.info("invalid .well-known response from %s: %s", uri_str, e) + cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD + cache_period += random.uniform(0, WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER) + defer.returnValue((INVALID_WELL_KNOWN, cache_period)) result = parsed_body["m.server"].encode("ascii") @@ -336,10 +367,20 @@ def _get_well_known(self, server_name): else: cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD) - if cache_period > 0: - self._well_known_cache.set(server_name, result, cache_period) + defer.returnValue((result, cache_period)) - defer.returnValue(result) + +@implementer(IStreamClientEndpoint) +class LoggingHostnameEndpoint(object): + """A wrapper for HostnameEndpint which logs when it connects""" + def __init__(self, reactor, host, port, *args, **kwargs): + self.host = host + self.port = port + self.ep = HostnameEndpoint(reactor, host, port, *args, **kwargs) + + def connect(self, protocol_factory): + logger.info("Connecting to %s:%i", self.host, self.port) + return self.ep.connect(protocol_factory) def _cache_period_from_headers(headers, time_now=time.time): diff --git a/tests/http/federation/test_matrix_federation_agent.py b/tests/http/federation/test_matrix_federation_agent.py index 7b2800021fa3..369e63ecc623 100644 --- a/tests/http/federation/test_matrix_federation_agent.py +++ b/tests/http/federation/test_matrix_federation_agent.py @@ -358,9 +358,8 @@ def test_get_no_srv_no_well_known(self): # Nothing happened yet self.assertNoResult(test_d) - self.mock_resolver.resolve_service.assert_called_once_with( - b"_matrix._tcp.testserv", - ) + # No SRV record lookup yet + self.mock_resolver.resolve_service.assert_not_called() # there should be an attempt to connect on port 443 for the .well-known clients = self.reactor.tcpClients @@ -376,6 +375,11 @@ def test_get_no_srv_no_well_known(self): # .well-known request fails. self.reactor.pump((0.4,)) + # now there should be a SRV lookup + self.mock_resolver.resolve_service.assert_called_once_with( + b"_matrix._tcp.testserv", + ) + # we should fall back to a direct connection self.assertEqual(len(clients), 2) (host, port, client_factory, _timeout, _bindAddress) = clients[1] @@ -403,8 +407,7 @@ def test_get_no_srv_no_well_known(self): self.successResultOf(test_d) def test_get_well_known(self): - """Test the behaviour when the server name has no port and no SRV record, but - the .well-known redirects elsewhere + """Test the behaviour when the .well-known redirects elsewhere """ self.mock_resolver.resolve_service.side_effect = lambda _: [] @@ -416,11 +419,6 @@ def test_get_well_known(self): # Nothing happened yet self.assertNoResult(test_d) - self.mock_resolver.resolve_service.assert_called_once_with( - b"_matrix._tcp.testserv", - ) - self.mock_resolver.resolve_service.reset_mock() - # there should be an attempt to connect on port 443 for the .well-known clients = self.reactor.tcpClients self.assertEqual(len(clients), 1) @@ -432,7 +430,7 @@ def test_get_well_known(self): client_factory, expected_sni=b"testserv", target_server=b"target-server", ) - # there should be another SRV lookup + # there should be a SRV lookup self.mock_resolver.resolve_service.assert_called_once_with( b"_matrix._tcp.target-server", ) @@ -483,11 +481,6 @@ def test_get_well_known_redirect(self): # Nothing happened yet self.assertNoResult(test_d) - self.mock_resolver.resolve_service.assert_called_once_with( - b"_matrix._tcp.testserv", - ) - self.mock_resolver.resolve_service.reset_mock() - # there should be an attempt to connect on port 443 for the .well-known clients = self.reactor.tcpClients self.assertEqual(len(clients), 1) @@ -529,7 +522,7 @@ def test_get_well_known_redirect(self): self.reactor.pump((0.1, )) - # there should be another SRV lookup + # there should be a SRV lookup self.mock_resolver.resolve_service.assert_called_once_with( b"_matrix._tcp.target-server", ) @@ -581,6 +574,7 @@ def test_get_hostname_srv(self): # Nothing happened yet self.assertNoResult(test_d) + # the request for a .well-known will have failed with a DNS lookup error. self.mock_resolver.resolve_service.assert_called_once_with( b"_matrix._tcp.testserv", ) @@ -613,11 +607,9 @@ def test_get_hostname_srv(self): self.successResultOf(test_d) def test_get_well_known_srv(self): - """Test the behaviour when the server name has no port and no SRV record, but - the .well-known redirects to a place where there is a SRV. + """Test the behaviour when the .well-known redirects to a place where there + is a SRV. """ - - self.mock_resolver.resolve_service.side_effect = lambda _: [] self.reactor.lookups["testserv"] = "1.2.3.4" self.reactor.lookups["srvtarget"] = "5.6.7.8" @@ -626,11 +618,6 @@ def test_get_well_known_srv(self): # Nothing happened yet self.assertNoResult(test_d) - self.mock_resolver.resolve_service.assert_called_once_with( - b"_matrix._tcp.testserv", - ) - self.mock_resolver.resolve_service.reset_mock() - # there should be an attempt to connect on port 443 for the .well-known clients = self.reactor.tcpClients self.assertEqual(len(clients), 1) @@ -646,7 +633,7 @@ def test_get_well_known_srv(self): client_factory, expected_sni=b"testserv", target_server=b"target-server", ) - # there should be another SRV lookup + # there should be a SRV lookup self.mock_resolver.resolve_service.assert_called_once_with( b"_matrix._tcp.target-server", ) @@ -691,9 +678,8 @@ def test_idna_servername(self): # Nothing happened yet self.assertNoResult(test_d) - self.mock_resolver.resolve_service.assert_called_once_with( - b"_matrix._tcp.xn--bcher-kva.com", - ) + # No SRV record lookup yet + self.mock_resolver.resolve_service.assert_not_called() # there should be an attempt to connect on port 443 for the .well-known clients = self.reactor.tcpClients @@ -709,6 +695,11 @@ def test_idna_servername(self): # .well-known request fails. self.reactor.pump((0.4,)) + # now there should have been a SRV lookup + self.mock_resolver.resolve_service.assert_called_once_with( + b"_matrix._tcp.xn--bcher-kva.com", + ) + # We should fall back to port 8448 clients = self.reactor.tcpClients self.assertEqual(len(clients), 2)