-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Python 3: Convert some unicode/bytes uses #3569
Changes from 16 commits
cb6b689
8801cb8
e729bdf
a14df28
ede1ace
f8172ef
df8f3e3
e0bf614
f0a00f0
35a41ab
4831ead
f04033e
e1bdb58
521a920
58df4a0
f152316
9fc33fd
6040be4
6745999
ed08bcb
da3502a
616864e
e876bcd
d5b735e
3cc58ea
b3a8de6
bfe288c
df8c45a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Unicode passwords are now normalised before hashing, preventing the instance where two different devices or browsers might send a different UTF-8 sequence for the password. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,9 @@ | |
# limitations under the License. | ||
|
||
import logging | ||
import unicodedata | ||
|
||
from six import PY2 | ||
|
||
import attr | ||
import bcrypt | ||
|
@@ -626,6 +629,10 @@ def validate_login(self, username, login_submission): | |
# special case to check for "password" for the check_password interface | ||
# for the auth providers | ||
password = login_submission.get("password") | ||
|
||
if password and PY2: | ||
password = password.decode('utf8') | ||
|
||
if login_type == LoginType.PASSWORD: | ||
if not self._password_enabled: | ||
raise SynapseError(400, "Password login has been disabled.") | ||
|
@@ -708,6 +715,7 @@ def _check_local_password(self, user_id, password): | |
|
||
Args: | ||
user_id (str): complete @user:id | ||
password (unicode): the provided password | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it might be worth clarifying that it will be a
(and similar on Of course what's really happening is that elsewhere we have been sloppy about saying There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's clearer to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, as discussed, |
||
Returns: | ||
(str) the canonical_user_id, or None if unknown user / bad password | ||
""" | ||
|
@@ -849,14 +857,19 @@ def hash(self, password): | |
"""Computes a secure hash of password. | ||
|
||
Args: | ||
password (str): Password to hash. | ||
password (unicode): Password to hash. | ||
|
||
Returns: | ||
Deferred(str): Hashed password. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/str/bytes/ ? |
||
""" | ||
def _do_hash(): | ||
return bcrypt.hashpw(password.encode('utf8') + self.hs.config.password_pepper, | ||
bcrypt.gensalt(self.bcrypt_rounds)) | ||
# Normalise the Unicode in the password | ||
pw = unicodedata.normalize("NFKC", password) | ||
|
||
return bcrypt.hashpw( | ||
pw.encode('utf8') + self.hs.config.password_pepper.encode("utf8"), | ||
bcrypt.gensalt(self.bcrypt_rounds), | ||
) | ||
|
||
return make_deferred_yieldable( | ||
threads.deferToThreadPool( | ||
|
@@ -868,16 +881,19 @@ def validate_hash(self, password, stored_hash): | |
"""Validates that self.hash(password) == stored_hash. | ||
|
||
Args: | ||
password (str): Password to hash. | ||
password (unicode): Password to hash. | ||
stored_hash (str): Expected hash value. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/str/bytes/ ? |
||
|
||
Returns: | ||
Deferred(bool): Whether self.hash(password) == stored_hash. | ||
""" | ||
|
||
def _do_validate_hash(): | ||
# Normalise the Unicode in the password | ||
pw = unicodedata.normalize("NFKC", password) | ||
|
||
return bcrypt.checkpw( | ||
password.encode('utf8') + self.hs.config.password_pepper, | ||
pw.encode('utf8') + self.hs.config.password_pepper.encode("utf8"), | ||
stored_hash.encode('utf8') | ||
) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,12 +13,13 @@ | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import cgi | ||
import collections | ||
import logging | ||
import urllib | ||
|
||
from six.moves import http_client | ||
from six import PY3 | ||
from six.moves import http_client, urllib | ||
|
||
from canonicaljson import encode_canonical_json, encode_pretty_printed_json, json | ||
|
||
|
@@ -264,6 +265,7 @@ def __init__(self, hs, canonical_json=True): | |
self.hs = hs | ||
|
||
def register_paths(self, method, path_patterns, callback): | ||
method = method.encode("utf-8") # method is bytes on py3 | ||
for path_pattern in path_patterns: | ||
logger.debug("Registering for %s %s", method, path_pattern.pattern) | ||
self.path_regexs.setdefault(method, []).append( | ||
|
@@ -296,8 +298,14 @@ def _async_render(self, request): | |
# here. If it throws an exception, that is handled by the wrapper | ||
# installed by @request_handler. | ||
|
||
def _unquote(s): | ||
if PY3: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why don't we decode when we have PY3? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the URL encoded sequence string "%E2%98%83", encoded as a unicode string: Python2:
This is wrong, it returns Unicode but the ASCII escaped character codes. Python 2:
Correct, returns the Unicode literal, but escaped for display as Py2 will usually not print real Unicode characters by itself. Python 3:
Correct, returns the Unicode literal (not escaped, as Python 3 has the correct terminal encoding set up). |
||
return urllib.parse.unquote(s) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we not utf-8 decode here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's already decoded by |
||
else: | ||
return urllib.parse.unquote(s.encode('ascii')).decode('utf8') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. URLs are 7-bit ASCII. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes they are, but that's not the point. the question is why we do the encoding here and not for python3. Likewise, I still don't understand why we decode the result on py2 and not on python3. Is it that whatever the answer is, could we have some explanation in comments? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ugh, what a mess. thanks for clarifying this. |
||
|
||
kwargs = intern_dict({ | ||
name: urllib.unquote(value).decode("UTF-8") if value else value | ||
name: _unquote(value) if value else value | ||
for name, value in group_dict.items() | ||
}) | ||
|
||
|
@@ -327,7 +335,7 @@ def _get_handler_for_request(self, request): | |
# Loop through all the registered callbacks to check if the method | ||
# and path regex match | ||
for path_entry in self.path_regexs.get(request.method, []): | ||
m = path_entry.pattern.match(request.path) | ||
m = path_entry.pattern.match(request.path.decode('ascii')) | ||
if m: | ||
# We found a match! | ||
return path_entry.callback, m.groupdict() | ||
|
@@ -383,7 +391,7 @@ def __init__(self, path): | |
self.url = path | ||
|
||
def render_GET(self, request): | ||
return redirectTo(self.url, request) | ||
return redirectTo(self.url.encode('ascii'), request) | ||
|
||
def getChild(self, name, request): | ||
if len(name) == 0: | ||
|
@@ -404,12 +412,14 @@ def respond_with_json(request, code, json_object, send_cors=False, | |
return | ||
|
||
if pretty_print: | ||
json_bytes = encode_pretty_printed_json(json_object) + "\n" | ||
json_bytes = (encode_pretty_printed_json(json_object) + "\n" | ||
).encode("utf-8") | ||
else: | ||
if canonical_json or synapse.events.USE_FROZEN_DICTS: | ||
# canonicaljson already encodes to bytes | ||
json_bytes = encode_canonical_json(json_object) | ||
else: | ||
json_bytes = json.dumps(json_object) | ||
json_bytes = json.dumps(json_object).encode("utf-8") | ||
|
||
return respond_with_json_bytes( | ||
request, code, json_bytes, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -175,6 +175,7 @@ def on_POST(self, request): | |
from synapse.rest.client.v2_alpha.register import RegisterRestServlet | ||
register = RegisterRestServlet(self.hs) | ||
|
||
password = password.decode('utf-8') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if there's a reason for doing |
||
(user_id, _) = yield register.registration_handler.register( | ||
localpart=username.lower(), password=password, admin=bool(admin), | ||
generate_token=False, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
# limitations under the License. | ||
import logging | ||
|
||
from six import PY2 | ||
from six.moves import http_client | ||
|
||
from twisted.internet import defer | ||
|
@@ -165,6 +166,9 @@ def on_POST(self, request): | |
assert_params_in_dict(params, ["new_password"]) | ||
new_password = params['new_password'] | ||
|
||
if PY2: | ||
new_password = new_password.decode('utf8') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. again, why not |
||
|
||
yield self._set_password_handler.set_password( | ||
user_id, new_password, requester | ||
) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -193,15 +193,15 @@ def __init__(self, hs): | |
def on_POST(self, request): | ||
body = parse_json_object_from_request(request) | ||
|
||
kind = "user" | ||
if "kind" in request.args: | ||
kind = request.args["kind"][0] | ||
kind = b"user" | ||
if b"kind" in request.args: | ||
kind = request.args[b"kind"][0] | ||
|
||
if kind == "guest": | ||
if kind == b"guest": | ||
ret = yield self._do_guest_registration(body) | ||
defer.returnValue(ret) | ||
return | ||
elif kind != "user": | ||
elif kind != b"user": | ||
raise UnrecognizedRequestError( | ||
"Do not understand membership kind: %s" % (kind,) | ||
) | ||
|
@@ -389,8 +389,13 @@ def on_POST(self, request): | |
assert_params_in_dict(params, ["password"]) | ||
|
||
desired_username = params.get("username", None) | ||
new_password = params.get("password", None) | ||
guest_access_token = params.get("guest_access_token", None) | ||
new_password = params.get("password", None) | ||
|
||
if new_password and isinstance(new_password, bytes): | ||
# We may not need to decode the password, if it came from the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this smells wrong. Surely we should be storing the same type in the session as is in the original params? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. also, haven't There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The original params are |
||
# session -- we've decoded it before then. | ||
new_password = new_password.decode('utf8') | ||
|
||
if desired_username is not None: | ||
desired_username = desired_username.lower() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,7 @@ | |
from collections import OrderedDict, deque, namedtuple | ||
from functools import wraps | ||
|
||
from six import iteritems | ||
from six import PY3, iteritems | ||
from six.moves import range | ||
|
||
from canonicaljson import json | ||
|
@@ -65,7 +65,10 @@ | |
|
||
|
||
def encode_json(json_object): | ||
return frozendict_json_encoder.encode(json_object) | ||
if PY3: | ||
return frozendict_json_encoder.encode(json_object) | ||
else: | ||
return frozendict_json_encoder.encode(json_object).decode("utf-8") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we only do this for py2? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. json/simplejson returns There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. (also the decodes in the functions below have been moved to this one spot) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok comments please to explain this. |
||
|
||
|
||
class _EventPeristenceQueue(object): | ||
|
@@ -1054,7 +1057,7 @@ def _update_outliers_txn(self, txn, events_and_contexts): | |
|
||
metadata_json = encode_json( | ||
event.internal_metadata.get_dict() | ||
).decode("UTF-8") | ||
) | ||
|
||
sql = ( | ||
"UPDATE event_json SET internal_metadata = ?" | ||
|
@@ -1168,8 +1171,8 @@ def event_dict(event): | |
"room_id": event.room_id, | ||
"internal_metadata": encode_json( | ||
event.internal_metadata.get_dict() | ||
).decode("UTF-8"), | ||
"json": encode_json(event_dict(event)).decode("UTF-8"), | ||
), | ||
"json": encode_json(event_dict(event)), | ||
} | ||
for event, _ in events_and_contexts | ||
], | ||
|
@@ -1188,7 +1191,7 @@ def event_dict(event): | |
"type": event.type, | ||
"processed": True, | ||
"outlier": event.internal_metadata.is_outlier(), | ||
"content": encode_json(event.content).decode("UTF-8"), | ||
"content": encode_json(event.content), | ||
"origin_server_ts": int(event.origin_server_ts), | ||
"received_ts": self._clock.time_msec(), | ||
"sender": event.sender, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,15 +74,21 @@ def _get_event_reference_hashes_txn(self, txn, event_id): | |
txn (cursor): | ||
event_id (str): Id for the Event. | ||
Returns: | ||
A dict of algorithm -> hash. | ||
A dict[str, bytes] of algorithm -> hash. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/str/unicode/ ? |
||
""" | ||
query = ( | ||
"SELECT algorithm, hash" | ||
" FROM event_reference_hashes" | ||
" WHERE event_id = ?" | ||
) | ||
txn.execute(query, (event_id, )) | ||
return {k: v for k, v in txn} | ||
if six.PY2: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we not use the PY3 path under py2 as well? |
||
return {k: v for k, v in txn} | ||
else: | ||
done = {} | ||
for k, v in txn: | ||
done[k] = v.encode('ascii') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fwiw I'd find a dict comprehension clearer here:
|
||
return done | ||
|
||
|
||
class SignatureStore(SignatureWorkerStore): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why is this necessary? surely login_submission is coming from the json body so should already have been un-utf8'ed?
(and if it is necessary, why don't we do it on PY3?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(should it not just be
password = unicode(password)
?)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
json.loads() does not do any decoding, it returns a dict with str keys/value, so bytes on Python 2 and unicode on Py3. We then need to decode it to utf8 to get the Unicode which we then need to pass to
unicodedata.normalize
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unicode(password)
also will decode it with ASCII, as it's forced decoding.