Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix IP URL previews on Python 3 #4215

Merged
merged 31 commits into from
Dec 21, 2018
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
9e0ae2f
fix
hawkowl Nov 21, 2018
1474953
changelog
hawkowl Nov 21, 2018
31e0fd9
fixes
hawkowl Nov 21, 2018
1520252
fixes
hawkowl Nov 21, 2018
ec14bca
fixes
hawkowl Nov 21, 2018
7af04df
fix pep8
hawkowl Nov 21, 2018
606e39b
fix pep8
hawkowl Nov 21, 2018
759169b
add some code coverage to blacklists
hawkowl Nov 22, 2018
a2f7ae1
tests
hawkowl Nov 26, 2018
a94ca42
Merge remote-tracking branch 'origin/develop' into hawkowl/ip-preview
hawkowl Nov 26, 2018
ca82572
fix pep8
hawkowl Nov 26, 2018
4c246c0
Merge remote-tracking branch 'origin/develop' into hawkowl/ip-preview
hawkowl Nov 28, 2018
fc1dd48
cleanup
hawkowl Nov 28, 2018
c70d2a1
log when we block
hawkowl Nov 30, 2018
5756f64
docstring
hawkowl Nov 30, 2018
0421f0b
docstring
hawkowl Nov 30, 2018
e59a5ee
comment
hawkowl Nov 30, 2018
4357b85
Merge remote-tracking branch 'origin/develop' into hawkowl/ip-preview
hawkowl Dec 5, 2018
d82e498
fix
hawkowl Dec 5, 2018
1210131
merge in tests
hawkowl Dec 5, 2018
6fade49
fix
hawkowl Dec 5, 2018
168a494
fix
hawkowl Dec 5, 2018
920625f
fix
hawkowl Dec 5, 2018
7d17b47
fix py2
hawkowl Dec 5, 2018
1d40dd5
Merge remote-tracking branch 'origin/develop' into hawkowl/ip-preview
hawkowl Dec 14, 2018
79a5655
fix missing hyperlink
hawkowl Dec 14, 2018
aac9bb4
Merge remote-tracking branch 'origin/develop' into hawkowl/ip-preview
hawkowl Dec 20, 2018
34952ce
review comments
hawkowl Dec 20, 2018
9e64ab8
review comments
hawkowl Dec 20, 2018
3b9e190
urllib does what we need here
hawkowl Dec 20, 2018
cfa8f6f
Merge branch 'develop' into hawkowl/ip-preview
hawkowl Dec 21, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,3 @@ source=

[report]
precision = 2
ignore_errors = True
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,4 @@ env/

.vscode/
.ropeproject/
.coverage.*
1 change: 1 addition & 0 deletions changelog.d/4215.misc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Getting URL previews of IP addresses no longer fails on Python 3.
115 changes: 56 additions & 59 deletions synapse/http/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,10 @@
from OpenSSL import SSL
from OpenSSL.SSL import VERIFY_NONE
from twisted.internet import defer, protocol, reactor, ssl
from twisted.internet.endpoints import HostnameEndpoint, wrapClientTLS
from twisted.web._newclient import ResponseDone
from twisted.web.client import (
URI,
Agent,
BrowserLikeRedirectAgent,
ContentDecoderAgent,
GzipDecoder,
HTTPConnectionPool,
PartialDownloadError,
readBody,
Expand All @@ -42,7 +39,6 @@

from synapse.api.errors import Codes, HttpResponseException, SynapseError
from synapse.http import cancelled_to_request_timed_out_error, redact_uri
from synapse.http.endpoint import SpiderEndpoint
from synapse.util.async_helpers import timeout_deferred
from synapse.util.caches import CACHE_SIZE_FACTOR
from synapse.util.logcontext import make_deferred_yieldable
Expand All @@ -59,10 +55,25 @@ class SimpleHttpClient(object):
A simple, no-frills HTTP client with methods that wrap up common ways of
using HTTP in Matrix
"""
def __init__(self, hs):

def __init__(self, hs, treq_args={}, whitelist=None, blacklist=None, _treq=treq):
hawkowl marked this conversation as resolved.
Show resolved Hide resolved
"""
Args:
hs (synapse.server.HomeServer)
treq_args (dict): Extra keyword arguments to be given to treq.request.
blacklist (netaddr.IPSet): The IP addresses that are blacklisted that
we may not request.
whitelist (netaddr.IPSet): The whitelisted IP addresses, that we can
request if it were otherwise caught in a blacklist.
_treq (treq): Treq implementation, can be overridden for testing.
"""
self.hs = hs

pool = HTTPConnectionPool(reactor)
self._treq = _treq
self._extra_treq_args = treq_args
self.whitelist = whitelist
self.blacklist = blacklist

# the pusher makes lots of concurrent SSL connections to sygnal, and
# tends to do so in batches, so we need to allow the pool to keep lots
Expand All @@ -81,13 +92,45 @@ def __init__(self, hs):
)
self.user_agent = hs.version_string
self.clock = hs.get_clock()
self.reactor = hs.get_reactor()
if hs.config.user_agent_suffix:
self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,)

self.user_agent = self.user_agent.encode('ascii')

@defer.inlineCallbacks
def request(self, method, uri, data=b'', headers=None):
"""
Args:
method (str): HTTP method to use.
uri (str): URI to query.
data (bytes): Data to send in the request body, if applicable.
headers (t.w.http_headers.Headers): Request headers.

Raises:
SynapseError: If the IP is blacklisted.
"""
# Check our IP whitelists/blacklists before making the request.
if self.blacklist:
split_uri = URI.fromBytes(uri.encode('utf8'))
address = yield make_deferred_yieldable(
self.reactor.resolve(split_uri.host)
)

from netaddr import IPAddress

ip_address = IPAddress(address)

if ip_address in self.blacklist:
if self.whitelist is None or ip_address not in self.whitelist:
logger.info(
"Blocked accessing %s because of blacklisted IP %s"
% (split_uri.host.decode('utf8'), ip_address)
)
raise SynapseError(
403, "IP address blocked by IP blacklist entry", Codes.UNKNOWN
)

# A small wrapper around self.agent.request() so we can easily attach
# counters to it
outgoing_requests_counter.labels(method).inc()
Expand All @@ -96,8 +139,13 @@ def request(self, method, uri, data=b'', headers=None):
logger.info("Sending request %s %s", method, redact_uri(uri))

try:
request_deferred = treq.request(
method, uri, agent=self.agent, data=data, headers=headers
request_deferred = self._treq.request(
method,
uri,
agent=self.agent,
data=data,
headers=headers,
**self._extra_treq_args
)
request_deferred = timeout_deferred(
request_deferred, 60, self.hs.get_reactor(),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all these unrelated formatting changes make this a much bigger diff, so make it much harder to review. pleeeease can we keep formatting and functional changes separate in future?

Expand Down Expand Up @@ -463,57 +511,6 @@ def post_urlencoded_get_raw(self, url, args={}):
defer.returnValue(e.response)


class SpiderEndpointFactory(object):
def __init__(self, hs):
self.blacklist = hs.config.url_preview_ip_range_blacklist
self.whitelist = hs.config.url_preview_ip_range_whitelist
self.policyForHTTPS = hs.get_http_client_context_factory()

def endpointForURI(self, uri):
logger.info("Getting endpoint for %s", uri.toBytes())

if uri.scheme == b"http":
endpoint_factory = HostnameEndpoint
elif uri.scheme == b"https":
tlsCreator = self.policyForHTTPS.creatorForNetloc(uri.host, uri.port)

def endpoint_factory(reactor, host, port, **kw):
return wrapClientTLS(
tlsCreator,
HostnameEndpoint(reactor, host, port, **kw))
else:
logger.warn("Can't get endpoint for unrecognised scheme %s", uri.scheme)
return None
return SpiderEndpoint(
reactor, uri.host, uri.port, self.blacklist, self.whitelist,
endpoint=endpoint_factory, endpoint_kw_args=dict(timeout=15),
)


class SpiderHttpClient(SimpleHttpClient):
"""
Separate HTTP client for spidering arbitrary URLs.
Special in that it follows retries and has a UA that looks
like a browser.

used by the preview_url endpoint in the content repo.
"""
def __init__(self, hs):
SimpleHttpClient.__init__(self, hs)
# clobber the base class's agent and UA:
self.agent = ContentDecoderAgent(
BrowserLikeRedirectAgent(
Agent.usingEndpointFactory(
reactor,
SpiderEndpointFactory(hs)
)
), [(b'gzip', GzipDecoder)]
)
# We could look like Chrome:
# self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko)
# Chrome Safari" % hs.version_string)


def encode_urlencode_args(args):
return {k: encode_urlencode_arg(v) for k, v in args.items()}

Expand Down
35 changes: 0 additions & 35 deletions synapse/http/endpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,41 +218,6 @@ def update_request_time(res):
return d


class SpiderEndpoint(object):
"""An endpoint which refuses to connect to blacklisted IP addresses
Implements twisted.internet.interfaces.IStreamClientEndpoint.
"""
def __init__(self, reactor, host, port, blacklist, whitelist,
endpoint=HostnameEndpoint, endpoint_kw_args={}):
self.reactor = reactor
self.host = host
self.port = port
self.blacklist = blacklist
self.whitelist = whitelist
self.endpoint = endpoint
self.endpoint_kw_args = endpoint_kw_args

@defer.inlineCallbacks
def connect(self, protocolFactory):
address = yield self.reactor.resolve(self.host)

from netaddr import IPAddress
ip_address = IPAddress(address)

if ip_address in self.blacklist:
if self.whitelist is None or ip_address not in self.whitelist:
raise ConnectError(
"Refusing to spider blacklisted IP address %s" % address
)

logger.info("Connecting to %s:%s", address, self.port)
endpoint = self.endpoint(
self.reactor, address, self.port, **self.endpoint_kw_args
)
connection = yield endpoint.connect(protocolFactory)
defer.returnValue(connection)


class SRVClientEndpoint(object):
"""An endpoint which looks up SRV records for a service.
Cycles through the list of servers starting with each call to connect
Expand Down
15 changes: 13 additions & 2 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from twisted.web.server import NOT_DONE_YET

from synapse.api.errors import Codes, SynapseError
from synapse.http.client import SpiderHttpClient
from synapse.http.client import SimpleHttpClient
from synapse.http.server import (
respond_with_json,
respond_with_json_bytes,
Expand Down Expand Up @@ -69,7 +69,12 @@ def __init__(self, hs, media_repo, media_storage):
self.max_spider_size = hs.config.max_spider_size
self.server_name = hs.hostname
self.store = hs.get_datastore()
self.client = SpiderHttpClient(hs)
self.client = SimpleHttpClient(
hs,
treq_args={"browser_like_redirects": True},
whitelist=hs.config.url_preview_ip_range_whitelist,
blacklist=hs.config.url_preview_ip_range_blacklist,
)
self.media_repo = media_repo
self.primary_base_path = media_repo.primary_base_path
self.media_storage = media_storage
Expand Down Expand Up @@ -318,6 +323,12 @@ def _download_url(self, url, user):
length, headers, uri, code = yield self.client.get_file(
url, output_stream=f, max_size=self.max_spider_size,
)
except SynapseError:
# Pass SynapseErrors through directly, so that the servlet
# handler will return a SynapseError to the client instead of
# blank data or a 500. Currently, this is only if the IP we are
hawkowl marked this conversation as resolved.
Show resolved Hide resolved
# trying to fetch from is blacklisted.
raise
except Exception as e:
# FIXME: pass through 404s and other error messages nicely
logger.warn("Error downloading %s: %r", url, e)
Expand Down
Loading