Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace MIME parsing with custom HTTP parsing. #200

Merged
merged 2 commits into from
Jul 17, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ Changelog

* Made read and write buffer sizes configurable.

* Rewrote HTTP handling for simplicity and performance.

3.3
...

Expand Down
13 changes: 5 additions & 8 deletions websockets/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,10 @@

import asyncio
import collections.abc
import email.message

from .exceptions import InvalidHandshake, InvalidMessage
from .handshake import build_request, check_response
from .http import USER_AGENT, read_response
from .http import USER_AGENT, build_headers, read_response
from .protocol import CONNECTING, OPEN, WebSocketCommonProtocol
from .uri import parse_uri

Expand All @@ -35,9 +34,7 @@ def write_http_request(self, path, headers):

"""
self.path = path
self.request_headers = email.message.Message()
for name, value in headers:
self.request_headers[name] = value
self.request_headers = build_headers(headers)
self.raw_request_headers = headers

# Since the path and headers only contain ASCII characters,
Expand All @@ -63,10 +60,10 @@ def read_http_response(self):
except ValueError as exc:
raise InvalidMessage("Malformed HTTP message") from exc

self.response_headers = headers
self.raw_response_headers = list(headers.raw_items())
self.response_headers = build_headers(headers)
self.raw_response_headers = headers

return status_code, headers
return status_code, self.response_headers

def process_subprotocol(self, get_header, subprotocols=None):
"""
Expand Down
146 changes: 118 additions & 28 deletions websockets/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
"""

import asyncio
import email.parser
import io
import http.client
import re
import sys

from .version import version as websockets_version
Expand All @@ -26,6 +26,26 @@
))


# See https://tools.ietf.org/html/rfc7230#appendix-B.

# Regex for validating header names.

_token_re = re.compile(rb'^[-!#$%&\'*+.^_`|~0-9a-zA-Z]+$')

# Regex for validating header values.

# We don't attempt to support obsolete line folding.

# Include HTAB (\x09), SP (\x20), VCHAR (\x21-\x7e), obs-text (\x80-\xff).

# The ABNF is complicated because it attempts to express that optional
# whitespace is ignored. We strip whitespace and don't revalidate that.

# See also https://www.rfc-editor.org/errata_search.php?rfc=7230&eid=4189

_value_re = re.compile(rb'^[\x09\x20-\x7e\x80-\xff]*$')


@asyncio.coroutine
def read_request(stream):
"""
Expand All @@ -34,20 +54,38 @@ def read_request(stream):
``stream`` is an :class:`~asyncio.StreamReader`.

Return ``(path, headers)`` where ``path`` is a :class:`str` and
``headers`` is a :class:`~email.message.Message`. ``path`` isn't
URL-decoded.
``headers`` is a list of ``(name, value)`` tuples.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With a custom class, you can probably go back to returning an object in these (three?) spots.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could, but I like using basic Python types in low level APIs when possible. A list of pairs is pretty manageable.


``path`` isn't URL-decoded or validated in any way.

Non-ASCII characters are represented with surrogate escapes.

Raise an exception if the request isn't well formatted.

The request is assumed not to contain a body.

"""
request_line, headers = yield from read_message(stream)
method, path, version = request_line[:-2].decode().split(None, 2)
if method != 'GET':
raise ValueError("Unsupported method")
if version != 'HTTP/1.1':
raise ValueError("Unsupported HTTP version")
# https://tools.ietf.org/html/rfc7230#section-3.1.1

# Parsing is simple because fixed values are expected for method and
# version and because path isn't checked. Since WebSocket software tends
# to implement HTTP/1.1 strictly, there's little need for lenient parsing.

# Given the implementation of read_line(), request_line ends with CRLF.
request_line = yield from read_line(stream)

# This may raise "ValueError: not enough values to unpack"
method, path, version = request_line[:-2].split(b' ', 2)

if method != b'GET':
raise ValueError("Unsupported HTTP method: %r" % method)
if version != b'HTTP/1.1':
raise ValueError("Unsupported HTTP version: %r" % version)

path = path.decode('ascii', 'surrogateescape')

headers = yield from read_headers(stream)

return path, headers


Expand All @@ -59,45 +97,82 @@ def read_response(stream):
``stream`` is an :class:`~asyncio.StreamReader`.

Return ``(status, headers)`` where ``status`` is a :class:`int` and
``headers`` is a :class:`~email.message.Message`.
``headers`` is a list of ``(name, value)`` tuples.

Non-ASCII characters are represented with surrogate escapes.

Raise an exception if the response isn't well formatted.

The response is assumed not to contain a body.

"""
status_line, headers = yield from read_message(stream)
version, status, reason = status_line[:-2].decode().split(" ", 2)
if version != 'HTTP/1.1':
raise ValueError("Unsupported HTTP version")
return int(status), headers
# https://tools.ietf.org/html/rfc7230#section-3.1.2

# As in read_request, parsing is simple because a fixed value is expected
# for version, status is a 3-digit number, and reason can be ignored.

# Given the implementation of read_line(), status_line ends with CRLF.
status_line = yield from read_line(stream)

# This may raise "ValueError: not enough values to unpack"
version, status, reason = status_line[:-2].split(b' ', 2)

if version != b'HTTP/1.1':
raise ValueError("Unsupported HTTP version: %r" % version)
# This may raise "ValueError: invalid literal for int() with base 10"
status = int(status)
if not 100 <= status < 1000:
raise ValueError("Unsupported HTTP status code: %d" % status)
if not _value_re.match(reason):
raise ValueError("Invalid HTTP reason phrase: %r" % reason)

headers = yield from read_headers(stream)

return status, headers


@asyncio.coroutine
def read_message(stream):
def read_headers(stream):
"""
Read an HTTP message from ``stream``.

``stream`` is an :class:`~asyncio.StreamReader`.

Return ``(start_line, headers)`` where ``start_line`` is :class:`bytes`
and ``headers`` is a :class:`~email.message.Message`.
and ``headers`` is a list of ``(name, value)`` tuples.

Non-ASCII characters are represented with surrogate escapes.

The message is assumed not to contain a body.

"""
start_line = yield from read_line(stream)
header_lines = io.BytesIO()
for num in range(MAX_HEADERS):
header_line = yield from read_line(stream)
header_lines.write(header_line)
if header_line == b'\r\n':
# https://tools.ietf.org/html/rfc7230#section-3.2

# We don't attempt to support obsolete line folding.

headers = []
for _ in range(MAX_HEADERS):
line = yield from read_line(stream)
if line == b'\r\n':
break

# This may raise "ValueError: not enough values to unpack"
name, value = line[:-2].split(b':', 1)
if not _token_re.match(name):
raise ValueError("Invalid HTTP header name: %r" % name)
value = value.strip(b' \t')
if not _value_re.match(value):
raise ValueError("Invalid HTTP header value: %r" % value)

headers.append((
name.decode('ascii'), # guaranteed to be ASCII at this point
value.decode('ascii', 'surrogateescape'),
))

else:
raise ValueError("Too many headers")
header_lines.seek(0)
headers = email.parser.BytesHeaderParser().parse(header_lines)
return start_line, headers
raise ValueError("Too many HTTP headers")

return headers


@asyncio.coroutine
Expand All @@ -108,9 +183,24 @@ def read_line(stream):
``stream`` is an :class:`~asyncio.StreamReader`.

"""
# Security: this is bounded by the StreamReader's limit (default = 32kB).
line = yield from stream.readline()
# Security: this guarantees header values are small (hardcoded = 4kB)
if len(line) > MAX_LINE:
raise ValueError("Line too long")
# Not mandatory but safe - https://tools.ietf.org/html/rfc7230#section-3.5
if not line.endswith(b'\r\n'):
raise ValueError("Line without CRLF")
return line


def build_headers(raw_headers):
    """
    Build a data structure for HTTP headers from a list of name - value pairs.

    ``raw_headers`` is an iterable of ``(name, value)`` tuples.

    Return a :class:`~http.client.HTTPMessage` holding these headers in the
    order they were given.

    See also https://github.com/aaugustin/websockets/issues/210.

    """
    headers = http.client.HTTPMessage()
    # HACK: fill the private ``_headers`` list directly instead of assigning
    # each header through the public mapping interface.
    # Copy into a fresh list so the message doesn't alias the caller's
    # sequence and so one-shot iterables can be looked up more than once.
    headers._headers = list(raw_headers)
    return headers
4 changes: 3 additions & 1 deletion websockets/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,13 @@ class WebSocketCommonProtocol(asyncio.StreamReaderProtocol):
processed, the request path is available in the :attr:`path` attribute,
and the request and response HTTP headers are available:

* as a MIME :class:`~email.message.Message` in the :attr:`request_headers`
* as a :class:`~http.client.HTTPMessage` in the :attr:`request_headers`
and :attr:`response_headers` attributes
* as an iterable of (name, value) pairs in the :attr:`raw_request_headers`
and :attr:`raw_response_headers` attributes

These attributes must be treated as immutable.

If a subprotocol was negotiated, it's available in the :attr:`subprotocol`
attribute.

Expand Down
13 changes: 5 additions & 8 deletions websockets/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@

import asyncio
import collections.abc
import email.message
import http
import logging

from .compatibility import asyncio_ensure_future
from .exceptions import InvalidHandshake, InvalidMessage, InvalidOrigin
from .handshake import build_response, check_request
from .http import USER_AGENT, read_request
from .http import USER_AGENT, build_headers, read_request
from .protocol import CONNECTING, OPEN, WebSocketCommonProtocol


Expand Down Expand Up @@ -155,20 +154,18 @@ def read_http_request(self):
raise InvalidMessage("Malformed HTTP message") from exc

self.path = path
self.request_headers = headers
self.raw_request_headers = list(headers.raw_items())
self.request_headers = build_headers(headers)
self.raw_request_headers = headers

return path, headers
return path, self.request_headers

@asyncio.coroutine
def write_http_response(self, status, headers):
"""
Write status line and headers to the HTTP response.

"""
self.response_headers = email.message.Message()
for name, value in headers:
self.response_headers[name] = value
self.response_headers = build_headers(headers)
self.raw_response_headers = headers

# Since the status line and headers only contain ASCII characters,
Expand Down
Loading