Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Python 3: Convert some unicode/bytes uses #3569

Merged
merged 28 commits into from
Aug 1, 2018
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
cb6b689
fix up bytestrings throughout
hawkowl Jul 20, 2018
8801cb8
changelog
hawkowl Jul 20, 2018
e729bdf
fix import
hawkowl Jul 20, 2018
a14df28
scoping is heck
hawkowl Jul 20, 2018
ede1ace
py2 compat
hawkowl Jul 20, 2018
f8172ef
py3 import
hawkowl Jul 20, 2018
df8f3e3
update to fix urllib
hawkowl Jul 20, 2018
e0bf614
encode the hash, too
hawkowl Jul 20, 2018
f0a00f0
fixes
hawkowl Jul 20, 2018
35a41ab
fix
hawkowl Jul 20, 2018
4831ead
isort
hawkowl Jul 20, 2018
f04033e
Merge branch 'develop' of ssh://github.com/matrix-org/synapse into ha…
hawkowl Jul 25, 2018
e1bdb58
review comments
hawkowl Jul 25, 2018
521a920
make auth completely unicode for passwords
hawkowl Jul 25, 2018
58df4a0
encodings
hawkowl Jul 25, 2018
f152316
cleanups
hawkowl Jul 25, 2018
9fc33fd
Merge branch 'develop' of ssh://github.com/matrix-org/synapse into ha…
hawkowl Jul 26, 2018
6040be4
do unicode properly
hawkowl Jul 26, 2018
6745999
return Unicode directly from the JSON encoder
hawkowl Jul 26, 2018
ed08bcb
stylistic cleanups
hawkowl Jul 26, 2018
da3502a
fix sytests
hawkowl Jul 26, 2018
616864e
pep8
hawkowl Jul 26, 2018
e876bcd
type cleanups
hawkowl Jul 27, 2018
d5b735e
fixes
hawkowl Jul 27, 2018
3cc58ea
Merge remote-tracking branch 'origin/develop' into hawkowl/bytes-clean-2
hawkowl Jul 27, 2018
b3a8de6
decode so we always put unicode into the db
hawkowl Aug 1, 2018
bfe288c
Merge remote-tracking branch 'origin/develop' into hawkowl/bytes-clean-2
hawkowl Aug 1, 2018
df8c45a
docstring
hawkowl Aug 1, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/3569.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Unicode passwords are now normalised before hashing, preventing the case where two different devices or browsers might send different UTF-8 sequences for the same password.
4 changes: 2 additions & 2 deletions synapse/api/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,10 +251,10 @@ def _get_appservice_user_id(self, request):
if ip_address not in app_service.ip_range_whitelist:
defer.returnValue((None, None))

if "user_id" not in request.args:
if b"user_id" not in request.args:
defer.returnValue((app_service.sender, app_service))

user_id = request.args["user_id"][0]
user_id = request.args[b"user_id"][0].decode('utf8')
if app_service.sender == user_id:
defer.returnValue((app_service.sender, app_service))

Expand Down
2 changes: 1 addition & 1 deletion synapse/federation/transport/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def _parse_auth_header(header_bytes):
param_dict = dict(kv.split("=") for kv in params)

def strip_quotes(value):
if value.startswith(b"\""):
if value.startswith("\""):
return value[1:-1]
else:
return value
Expand Down
19 changes: 16 additions & 3 deletions synapse/handlers/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# limitations under the License.

import logging
import unicodedata

import attr
import bcrypt
Expand Down Expand Up @@ -855,8 +856,16 @@ def hash(self, password):
Deferred(str): Hashed password.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

s/str/bytes/ ?

"""
def _do_hash():
return bcrypt.hashpw(password.encode('utf8') + self.hs.config.password_pepper,
bcrypt.gensalt(self.bcrypt_rounds))
# Ensure that we normalise the password
if isinstance(password, bytes):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shouldn't we decide if it's meant to be a str or a bytes? or document it as such if it's really meant to be either?

Also, shouldn't this be written as follows?

if isinstance(password, bytes):
    password = password.decode('utf8')
pw = unicodedata.normalize("NFKC", password)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That would be a nicer way of writing it, but password isn't in a writable scope (because it's inside another function).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wrt the type -- it's being passed in as str, but since it's not clear that it would only be decoded on py2, I've changed it to be a Py3 check.

pw = unicodedata.normalize("NFKC", password.decode('utf8'))
else:
pw = unicodedata.normalize("NFKC", password)

return bcrypt.hashpw(
pw.encode('utf8') + self.hs.config.password_pepper.encode("utf8"),
bcrypt.gensalt(self.bcrypt_rounds),
)

return make_deferred_yieldable(
threads.deferToThreadPool(
Expand All @@ -876,8 +885,12 @@ def validate_hash(self, password, stored_hash):
"""

def _do_validate_hash():
if isinstance(password, bytes):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as above

pw = unicodedata.normalize("NFKC", password.decode('utf8'))
else:
pw = unicodedata.normalize("NFKC", password)
return bcrypt.checkpw(
password.encode('utf8') + self.hs.config.password_pepper,
pw.encode('utf8') + self.hs.config.password_pepper.encode("utf8"),
stored_hash.encode('utf8')
)

Expand Down
24 changes: 17 additions & 7 deletions synapse/http/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cgi
import collections
import logging
import urllib

from six.moves import http_client
from six import PY3
from six.moves import http_client, urllib

from canonicaljson import encode_canonical_json, encode_pretty_printed_json, json

Expand Down Expand Up @@ -264,6 +265,7 @@ def __init__(self, hs, canonical_json=True):
self.hs = hs

def register_paths(self, method, path_patterns, callback):
method = method.encode("utf-8") # method is bytes on py3
for path_pattern in path_patterns:
logger.debug("Registering for %s %s", method, path_pattern.pattern)
self.path_regexs.setdefault(method, []).append(
Expand Down Expand Up @@ -296,8 +298,14 @@ def _async_render(self, request):
# here. If it throws an exception, that is handled by the wrapper
# installed by @request_handler.

def _parse(s):
if PY3:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why don't we decode when we have PY3?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given the URL encoded sequence string "%E2%98%83", encoded as a unicode string:

Python 2:

>>> urllib.unquote(u"%E2%98%83")
u'\xe2\x98\x83'

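This is wrong: it returns a Unicode string, but each percent-escape has become a separate code point holding the raw UTF-8 byte value, instead of the bytes being decoded into the single character they represent.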

Python 2:

>>> urllib.unquote(u"%E2%98%83".encode('ascii')).decode('utf8')
u'\u2603'

Correct: this returns the intended Unicode character (U+2603). It is shown escaped only because the Python 2 REPL prints the repr of the string, which escapes non-ASCII characters.

Python 3:

>>> urllib.parse.unquote(u"%E2%98%83")
'☃'

Correct: this returns the intended Unicode character, shown unescaped because Python 3 displays printable non-ASCII characters as-is.

return urllib.parse.unquote(s)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we not utf-8 decode here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's already decoded by _get_handler_for_request.

else:
return urllib.parse.unquote(s.encode('utf8')).decode('utf8')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are we now encoding this when we weren't before? And why doesn't it need to happen on py3?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are encoding this because we now parse the incoming arguments to Unicode in _get_handler_for_request, so on Python 2 they must be encoded back to bytes before unquoting.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


kwargs = intern_dict({
name: urllib.unquote(value).decode("UTF-8") if value else value
name: _parse(value) if value else value
for name, value in group_dict.items()
})

Expand Down Expand Up @@ -327,7 +335,7 @@ def _get_handler_for_request(self, request):
# Loop through all the registered callbacks to check if the method
# and path regex match
for path_entry in self.path_regexs.get(request.method, []):
m = path_entry.pattern.match(request.path)
m = path_entry.pattern.match(request.path.decode())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

decode() without an explicit encoding makes me twitchy.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

if m:
# We found a match!
return path_entry.callback, m.groupdict()
Expand Down Expand Up @@ -383,7 +391,7 @@ def __init__(self, path):
self.url = path

def render_GET(self, request):
return redirectTo(self.url, request)
return redirectTo(self.url.encode(), request)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

likewise encode()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed


def getChild(self, name, request):
if len(name) == 0:
Expand All @@ -404,12 +412,14 @@ def respond_with_json(request, code, json_object, send_cors=False,
return

if pretty_print:
json_bytes = encode_pretty_printed_json(json_object) + "\n"
json_bytes = (encode_pretty_printed_json(json_object) + "\n"
).encode("utf-8")
else:
if canonical_json or synapse.events.USE_FROZEN_DICTS:
# canonicaljson already encodes to bytes
json_bytes = encode_canonical_json(json_object)
else:
json_bytes = json.dumps(json_object)
json_bytes = json.dumps(json_object).encode("utf-8")

return respond_with_json_bytes(
request, code, json_bytes,
Expand Down
10 changes: 5 additions & 5 deletions synapse/rest/client/v2_alpha/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,15 @@ def __init__(self, hs):
def on_POST(self, request):
body = parse_json_object_from_request(request)

kind = "user"
if "kind" in request.args:
kind = request.args["kind"][0]
kind = b"user"
if b"kind" in request.args:
kind = request.args[b"kind"][0]

if kind == "guest":
if kind == b"guest":
ret = yield self._do_guest_registration(body)
defer.returnValue(ret)
return
elif kind != "user":
elif kind != b"user":
raise UnrecognizedRequestError(
"Do not understand membership kind: %s" % (kind,)
)
Expand Down
2 changes: 1 addition & 1 deletion synapse/rest/media/v1/media_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def ensure_media_is_in_local_cache(self, file_info):
if res:
with res:
consumer = BackgroundFileConsumer(
open(local_path, "w"), self.hs.get_reactor())
open(local_path, "wb"), self.hs.get_reactor())
yield res.write_to_consumer(consumer)
yield consumer.wait()
defer.returnValue(local_path)
Expand Down
2 changes: 1 addition & 1 deletion synapse/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,7 @@ def resolve_state_groups(

def _ordered_events(events):
def key_func(e):
return -int(e.depth), hashlib.sha1(e.event_id.encode()).hexdigest()
return -int(e.depth), hashlib.sha1(e.event_id.encode('ascii')).hexdigest()

return sorted(events, key=key_func)

Expand Down
15 changes: 9 additions & 6 deletions synapse/storage/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from collections import OrderedDict, deque, namedtuple
from functools import wraps

from six import iteritems, itervalues
from six import PY3, iteritems, itervalues
from six.moves import range

from canonicaljson import json
Expand Down Expand Up @@ -65,7 +65,10 @@


def encode_json(json_object):
return frozendict_json_encoder.encode(json_object)
if PY3:
return frozendict_json_encoder.encode(json_object)
else:
return frozendict_json_encoder.encode(json_object).decode("utf-8")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why do we only do this for py2?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

json/simplejson returns str (so unicode on py3, bytes on py2). We want it as unicode, so we have to decode it on Py2.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(also the decodes in the functions below have been moved to this one spot)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, but please add comments to explain this.



class _EventPeristenceQueue(object):
Expand Down Expand Up @@ -981,7 +984,7 @@ def _update_outliers_txn(self, txn, events_and_contexts):

metadata_json = encode_json(
event.internal_metadata.get_dict()
).decode("UTF-8")
)

sql = (
"UPDATE event_json SET internal_metadata = ?"
Expand Down Expand Up @@ -1095,8 +1098,8 @@ def event_dict(event):
"room_id": event.room_id,
"internal_metadata": encode_json(
event.internal_metadata.get_dict()
).decode("UTF-8"),
"json": encode_json(event_dict(event)).decode("UTF-8"),
),
"json": encode_json(event_dict(event)),
}
for event, _ in events_and_contexts
],
Expand All @@ -1115,7 +1118,7 @@ def event_dict(event):
"type": event.type,
"processed": True,
"outlier": event.internal_metadata.is_outlier(),
"content": encode_json(event.content).decode("UTF-8"),
"content": encode_json(event.content),
"origin_server_ts": int(event.origin_server_ts),
"received_ts": self._clock.time_msec(),
"sender": event.sender,
Expand Down
11 changes: 10 additions & 1 deletion synapse/storage/signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,16 @@ def _get_event_reference_hashes_txn(self, txn, event_id):
" WHERE event_id = ?"
)
txn.execute(query, (event_id, ))
return {k: v for k, v in txn}
if six.PY2:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we not use the PY3 path under py2 as well?

return {k: v for k, v in txn}
else:
done = {}
for k, v in txn:
if not isinstance(v, bytes):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why does it vary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It shouldn't, so I removed the if.

done[k] = v.encode('ascii')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we update the docstring to note that this returns a dict[str,bytes] please?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

else:
done[k] = v
return done


class SignatureStore(SignatureWorkerStore):
Expand Down
2 changes: 1 addition & 1 deletion synapse/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def __deepcopy__(self, memo):
@classmethod
def from_string(cls, s):
"""Parse the string given by 's' into a structure object."""
if len(s) < 1 or s[0] != cls.SIGIL:
if len(s) < 1 or s[0:1] != cls.SIGIL:
raise SynapseError(400, "Expected %s string to start with '%s'" % (
cls.__name__, cls.SIGIL,
))
Expand Down
6 changes: 3 additions & 3 deletions synapse/util/frozenutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from six import string_types
from six import binary_type, text_type

from canonicaljson import json
from frozendict import frozendict
Expand All @@ -26,7 +26,7 @@ def freeze(o):
if isinstance(o, frozendict):
return o

if isinstance(o, string_types):
if isinstance(o, (binary_type, text_type)):
return o

try:
Expand All @@ -41,7 +41,7 @@ def unfreeze(o):
if isinstance(o, (dict, frozendict)):
return dict({k: unfreeze(v) for k, v in o.items()})

if isinstance(o, string_types):
if isinstance(o, (binary_type, text_type)):
return o

try:
Expand Down
Loading