-
Notifications
You must be signed in to change notification settings - Fork 0
/
codecutil.py
91 lines (75 loc) · 2.23 KB
/
codecutil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import codecs
# Inclusive (first, last) code point ranges.  The names and values match the
# ``ucschar`` and ``iprivate`` productions of RFC 3987 (IRIs).  The entries
# below all sit in the Basic Multilingual Plane and apply to every build.
UCSCHAR = [
    (0xA0, 0xD7FF),
    (0xF900, 0xFDCF),
    (0xFDF0, 0xFFEF),
]
IPRIVATE = [
    (0xE000, 0xF8FF),
]

try:
    chr(0x10000)
except ValueError:
    # narrow python build: chr() rejects supplementary-plane code points,
    # so only the BMP ranges above are kept.
    pass
else:
    # Planes 1 through 13 each contribute the range xx0000-xxFFFD.
    UCSCHAR.extend(
        (plane << 16, (plane << 16) | 0xFFFD) for plane in range(0x1, 0xE)
    )
    # Plane 14 is the odd one out: its range starts at 0xE1000.
    UCSCHAR.append((0xE1000, 0xEFFFD))
    IPRIVATE.extend([
        (0xF0000, 0xFFFFD),
        (0x100000, 0x10FFFD),
    ])

# All ranges consulted when deciding whether a code point is escaped.
_ESCAPE_RANGES = UCSCHAR + IPRIVATE
def _in_escape_range(octet):
    """Report whether *octet* falls inside any range of ``_ESCAPE_RANGES``.

    Every range is an inclusive ``(first, last)`` pair of integers.  Despite
    the parameter name, the caller in this module passes a code point
    obtained from ``ord()``.

    Returns True if at least one range contains the value, False otherwise.
    """
    return any(first <= octet <= last for first, last in _ESCAPE_RANGES)
def _starts_surrogate_pair(character):
    """Return True if *character* is a high (leading) surrogate.

    High surrogates are the code points U+D800 through U+DBFF inclusive.
    """
    return ord(character) in range(0xD800, 0xDBFF + 1)
def _ends_surrogate_pair(character):
    """Return True if *character* is a low (trailing) surrogate.

    Low surrogates are the code points U+DC00 through U+DFFF inclusive.
    """
    return ord(character) in range(0xDC00, 0xDFFF + 1)
def _pct_encoded_replacements(chunk):
    """Build the replacement pieces for the characters in *chunk*.

    Each character is handled as follows:

    * a code point inside one of the escape ranges is UTF-8 encoded and
      every resulting byte becomes a ``%XX`` piece;
    * a high surrogate immediately followed by a low surrogate is combined
      into the supplementary code point the pair stands for, which is then
      escaped the same way;
    * anything else, including an unpaired surrogate, is passed through
      unchanged as a one-character piece.

    Returns a list of strings whose concatenation is the replacement text.
    """
    characters = list(chunk)
    total = len(characters)
    replacements = []
    position = 0
    while position < total:
        character = characters[position]
        position += 1
        codepoint = ord(character)
        if _in_escape_range(codepoint):
            encoded = character.encode("utf-8")
        elif (
            _starts_surrogate_pair(character)
            and position < total
            and _ends_surrogate_pair(characters[position])
        ):
            # Python 3 refuses to UTF-8 encode surrogate code points, so
            # the pair is first folded into the scalar value it represents.
            low = ord(characters[position])
            position += 1
            combined = 0x10000 + ((codepoint - 0xD800) << 10) + (low - 0xDC00)
            encoded = chr(combined).encode("utf-8")
        else:
            # Covers unpaired high surrogates too: looking ahead instead of
            # calling next() avoids leaking StopIteration at the end of the
            # chunk and avoids swallowing a following non-surrogate.
            replacements.append(character)
            continue
        # UTF-8 bytes of code points >= 0x80 are all >= 0x80, so "%X" always
        # yields two hex digits here.
        replacements.extend("%%%X" % byte for byte in encoded)
    return replacements
def _pct_escape_handler(err):
    '''
    Error handler for codecs.register_error that percent-escapes the
    characters a codec reported as unencodable.

    err is the Unicode error passed in by the codec machinery; the failing
    slice err.object[err.start:err.end] is converted by
    _pct_encoded_replacements.

    Returns a (replacement, position) tuple: the joined replacement text
    and err.end, the index at which processing resumes.

    TODO: replace use of this with urllib.parse.quote as appropriate
    '''
    failing_slice = err.object[err.start:err.end]
    escaped_text = "".join(_pct_encoded_replacements(failing_slice))
    return (escaped_text, err.end)
# Import-time side effect: register the handler under the name
# "oid_percent_escape" so it can be selected through the ``errors``
# argument of an encode call, e.g. text.encode("ascii", "oid_percent_escape").
codecs.register_error("oid_percent_escape", _pct_escape_handler)