-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Python 3: Convert some unicode/bytes uses #3569
Changes from 22 commits
cb6b689
8801cb8
e729bdf
a14df28
ede1ace
f8172ef
df8f3e3
e0bf614
f0a00f0
35a41ab
4831ead
f04033e
e1bdb58
521a920
58df4a0
f152316
9fc33fd
6040be4
6745999
ed08bcb
da3502a
616864e
e876bcd
d5b735e
3cc58ea
b3a8de6
bfe288c
df8c45a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Unicode passwords are now normalised before hashing, preventing the instance where two different devices or browsers might send a different UTF-8 sequence for the password. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,7 @@ | |
# limitations under the License. | ||
|
||
import logging | ||
import unicodedata | ||
|
||
import attr | ||
import bcrypt | ||
|
@@ -626,6 +627,7 @@ def validate_login(self, username, login_submission): | |
# special case to check for "password" for the check_password interface | ||
# for the auth providers | ||
password = login_submission.get("password") | ||
|
||
if login_type == LoginType.PASSWORD: | ||
if not self._password_enabled: | ||
raise SynapseError(400, "Password login has been disabled.") | ||
|
@@ -708,6 +710,7 @@ def _check_local_password(self, user_id, password): | |
|
||
Args: | ||
user_id (str): complete @user:id | ||
password (unicode): the provided password | ||
Returns: | ||
(str) the canonical_user_id, or None if unknown user / bad password | ||
""" | ||
|
@@ -849,14 +852,19 @@ def hash(self, password): | |
"""Computes a secure hash of password. | ||
|
||
Args: | ||
password (str): Password to hash. | ||
password (unicode): Password to hash. | ||
|
||
Returns: | ||
Deferred(str): Hashed password. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/str/bytes/ ? |
||
""" | ||
def _do_hash(): | ||
return bcrypt.hashpw(password.encode('utf8') + self.hs.config.password_pepper, | ||
bcrypt.gensalt(self.bcrypt_rounds)) | ||
# Normalise the Unicode in the password | ||
pw = unicodedata.normalize("NFKC", password) | ||
|
||
return bcrypt.hashpw( | ||
pw.encode('utf8') + self.hs.config.password_pepper.encode("utf8"), | ||
bcrypt.gensalt(self.bcrypt_rounds), | ||
) | ||
|
||
return make_deferred_yieldable( | ||
threads.deferToThreadPool( | ||
|
@@ -868,16 +876,19 @@ def validate_hash(self, password, stored_hash): | |
"""Validates that self.hash(password) == stored_hash. | ||
|
||
Args: | ||
password (str): Password to hash. | ||
password (unicode): Password to hash. | ||
stored_hash (str): Expected hash value. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/str/bytes/ ? |
||
|
||
Returns: | ||
Deferred(bool): Whether self.hash(password) == stored_hash. | ||
""" | ||
|
||
def _do_validate_hash(): | ||
# Normalise the Unicode in the password | ||
pw = unicodedata.normalize("NFKC", password) | ||
|
||
return bcrypt.checkpw( | ||
password.encode('utf8') + self.hs.config.password_pepper, | ||
pw.encode('utf8') + self.hs.config.password_pepper.encode("utf8"), | ||
stored_hash.encode('utf8') | ||
) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,12 +13,13 @@ | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import cgi | ||
import collections | ||
import logging | ||
import urllib | ||
|
||
from six.moves import http_client | ||
from six import PY3 | ||
from six.moves import http_client, urllib | ||
|
||
from canonicaljson import encode_canonical_json, encode_pretty_printed_json, json | ||
|
||
|
@@ -264,6 +265,7 @@ def __init__(self, hs, canonical_json=True): | |
self.hs = hs | ||
|
||
def register_paths(self, method, path_patterns, callback): | ||
method = method.encode("utf-8") # method is bytes on py3 | ||
for path_pattern in path_patterns: | ||
logger.debug("Registering for %s %s", method, path_pattern.pattern) | ||
self.path_regexs.setdefault(method, []).append( | ||
|
@@ -296,8 +298,14 @@ def _async_render(self, request): | |
# here. If it throws an exception, that is handled by the wrapper | ||
# installed by @request_handler. | ||
|
||
def _unquote(s): | ||
if PY3: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why don't we decode when we have PY3? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the URL encoded sequence string "%E2%98%83", encoded as a unicode string: Python2:
This is wrong, it returns Unicode but the ASCII escaped character codes. Python 2:
Correct, returns the Unicode literal, but escaped for display as Py2 will usually not print real Unicode characters by itself. Python 3:
Correct, returns the Unicode literal (not escaped, as Python 3 has the correct terminal encoding set up). |
||
return urllib.parse.unquote(s) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we not utf-8 decode here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's already decoded by |
||
else: | ||
return urllib.parse.unquote(s.encode('ascii')).decode('utf8') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. URLs are 7-bit ASCII. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes they are, but that's not the point. the question is why we do the encoding here and not for python3. Likewise, I still don't understand why we decode the result on py2 and not on python3. Is it that whatever the answer is, could we have some explanation in comments? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ugh, what a mess. thanks for clarifying this. |
||
|
||
kwargs = intern_dict({ | ||
name: urllib.unquote(value).decode("UTF-8") if value else value | ||
name: _unquote(value) if value else value | ||
for name, value in group_dict.items() | ||
}) | ||
|
||
|
@@ -327,7 +335,7 @@ def _get_handler_for_request(self, request): | |
# Loop through all the registered callbacks to check if the method | ||
# and path regex match | ||
for path_entry in self.path_regexs.get(request.method, []): | ||
m = path_entry.pattern.match(request.path) | ||
m = path_entry.pattern.match(request.path.decode('ascii')) | ||
if m: | ||
# We found a match! | ||
return path_entry.callback, m.groupdict() | ||
|
@@ -383,7 +391,7 @@ def __init__(self, path): | |
self.url = path | ||
|
||
def render_GET(self, request): | ||
return redirectTo(self.url, request) | ||
return redirectTo(self.url.encode('ascii'), request) | ||
|
||
def getChild(self, name, request): | ||
if len(name) == 0: | ||
|
@@ -404,12 +412,14 @@ def respond_with_json(request, code, json_object, send_cors=False, | |
return | ||
|
||
if pretty_print: | ||
json_bytes = encode_pretty_printed_json(json_object) + "\n" | ||
json_bytes = (encode_pretty_printed_json(json_object) + "\n" | ||
).encode("utf-8") | ||
else: | ||
if canonical_json or synapse.events.USE_FROZEN_DICTS: | ||
# canonicaljson already encodes to bytes | ||
json_bytes = encode_canonical_json(json_object) | ||
else: | ||
json_bytes = json.dumps(json_object) | ||
json_bytes = json.dumps(json_object).encode("utf-8") | ||
|
||
return respond_with_json_bytes( | ||
request, code, json_bytes, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,15 +74,21 @@ def _get_event_reference_hashes_txn(self, txn, event_id): | |
txn (cursor): | ||
event_id (str): Id for the Event. | ||
Returns: | ||
A dict of algorithm -> hash. | ||
A dict[str, bytes] of algorithm -> hash. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. s/str/unicode/ ? |
||
""" | ||
query = ( | ||
"SELECT algorithm, hash" | ||
" FROM event_reference_hashes" | ||
" WHERE event_id = ?" | ||
) | ||
txn.execute(query, (event_id, )) | ||
return {k: v for k, v in txn} | ||
if six.PY2: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we not use the PY3 path under py2 as well? |
||
return {k: v for k, v in txn} | ||
else: | ||
done = {} | ||
for k, v in txn: | ||
done[k] = v.encode('ascii') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fwiw I'd find a dict comprehension clearer here:
|
||
return done | ||
|
||
|
||
class SignatureStore(SignatureWorkerStore): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it might be worth clarifying that it will be a
str
on python3. Something like:(and similar on
validate_hash
etc etc)Of course what's really happening is that elsewhere we have been sloppy about saying
str
when we meanstr|unicode
, though I don't suggest we change that...There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's clearer to use
bytes
(flat bytes),str
(str on either platform), orunicode
(unicode), even though there is no such thing asunicode
on Python 3.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, as discussed,
unicode
seems the right answer here. However, the difference withuser_id
and the return type is now very striking; suggest you update them too.