Skip to content

Commit

Permalink
don't try to guess encodings
Browse files Browse the repository at this point in the history
lxml works best with bytes, so encode strings as utf-8 bytes

Closes #76
  • Loading branch information
longhotsummer committed Aug 30, 2023
1 parent a8f50f0 commit f9e3aff
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 6 deletions.
7 changes: 1 addition & 6 deletions cobalt/akn.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,8 @@ class AkomaNtosoDocument:
source = ["cobalt", "cobalt", "https://github.com/laws-africa/cobalt"]

def __init__(self, xml=None):
    """ Parse ``xml`` and create an element maker for the document's namespace.

    :param xml: the document source, as ``str`` or ``bytes``. Strings are
        always encoded to UTF-8 bytes before being parsed.
    """
    # lxml works best with bytes and doesn't like unicode strings with an
    # encoding element, so don't try to guess encodings: always hand it
    # UTF-8 bytes. Searching the input for a declaration first is
    # unnecessary and raises TypeError when xml is None.
    if isinstance(xml, str):
        xml = xml.encode('utf-8')

    self.parse(xml)
    self.maker = objectify.ElementMaker(annotate=False, namespace=self.namespace, nsmap=self.root.nsmap)

Expand Down
36 changes: 36 additions & 0 deletions tests/test_structured_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,42 @@ def test_parser(self):
</act>
</akomaNtoso>""", a.document_type)

def test_unicode(self):
    """ Non-ASCII text must survive parsing for every supported input form:
    str and UTF-8 bytes, each with and without an XML declaration that
    names an encoding.
    """
    declaration = '<?xml version="1.0" encoding="utf-8"?>'
    document = """<akomaNtoso xmlns="http://docs.oasis-open.org/legaldocml/ns/akn/3.0">
<act>
<meta/>
<body>😀</body>
</act>
</akomaNtoso>"""

    cases = [
        ("string, no encoding", document),
        ("bytes, no encoding", document.encode('utf-8')),
        ("with encoding attribute, bytes", (declaration + document).encode('utf-8')),
        ("with encoding string", declaration + document),
    ]

    for label, xml in cases:
        # subTest keeps the remaining input forms running if one of them fails
        with self.subTest(label):
            a = Act(xml)
            body = a.root.xpath("//a:body", namespaces={'a': a.namespace})[0]
            # every form, including a str with an encoding declaration, must
            # round-trip the emoji and not merely parse without raising
            self.assertEqual(body.text, "😀")

def test_add_number(self):
""" When adding an FRBRnumber element to a document that doesn't already have one, it
must come after subtype.
Expand Down

0 comments on commit f9e3aff

Please sign in to comment.