Skip to content

Commit

Permalink
add test cases for non-ascii
Browse files (browse the repository at this point in the history)
  • Loading branch information
Yifan Peng committed Jun 22, 2018
1 parent 258f18b commit 244948f
Show file tree
Hide file tree
Showing 11 changed files with 30 additions and 31 deletions.
1 change: 1 addition & 0 deletions bioc/decoder.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def decode(self, fp):
Returns:
BioCCollection: an object of BioCCollection
"""
# utf8_parser = etree.XMLParser(encoding='utf-8')
tree = etree.parse(fp)
collection = self.__parse_collection(tree.getroot())
collection.encoding = tree.docinfo.encoding
Expand Down
4 changes: 2 additions & 2 deletions tests/bioc/everything.json
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,14 @@
"infons": {
"sentence-infon-key": "sentence-infon-value"
},
"text": "hijklm",
"text": "测试Non-ASCII",
"annotations": [
{
"id": "4",
"infons": {
"annotation-infon-key": "annotation-infon-value"
},
"text": "hi",
"text": "测试",
"locations": [
{
"offset": 34,
Expand Down
2 changes: 0 additions & 2 deletions tests/bioc/everything.jsonl

This file was deleted.

4 changes: 2 additions & 2 deletions tests/bioc/everything.xml
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@
<sentence>
<infon key="sentence-infon-key">sentence-infon-value</infon>
<offset>34</offset>
<text>hijklm</text>
<text>测试Non-ASCII</text>
<annotation id="4">
<infon key="annotation-infon-key">annotation-infon-value</infon>
<location offset="34" length="2"/>
<text>hi</text>
<text>测试</text>
</annotation>
</sentence>
</passage>
Expand Down
14 changes: 7 additions & 7 deletions tests/bioc/test_bioc.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -9,39 +9,39 @@


def test_load():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.load(fp)
assert_everything(collection)


def test_loads():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
s = fp.read()
collection = bioc.loads(s)
assert_everything(collection)


def test_dump():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.load(fp)
tmp = tempfile.NamedTemporaryFile()
with open(tmp.name, 'w') as fp:
with open(tmp.name, 'w', encoding='utf8') as fp:
bioc.dump(collection, fp)
with open(tmp.name) as fp:
with open(tmp.name, encoding='utf8') as fp:
collection = bioc.load(fp)
assert_everything(collection)


def test_dumps():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.load(fp)
s = bioc.dumps(collection)
collection = bioc.loads(s)
assert_everything(collection)


def test_validate():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.load(fp)
bioc.validate(collection)

Expand Down
6 changes: 3 additions & 3 deletions tests/bioc/test_iterator.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

def test_sentences():
filename = os.path.join(os.path.dirname(__file__), 'everything.xml')
with open(filename) as fp:
with open(filename, encoding='utf8') as fp:
collection = bioc.load(fp)

sentences = list(bioc.sentences(collection))
Expand All @@ -20,7 +20,7 @@ def test_sentences():

def test_annotations():
filename = os.path.join(os.path.dirname(__file__), 'everything.xml')
with open(filename) as fp:
with open(filename, encoding='utf8') as fp:
collection = bioc.load(fp)

annotations = list(bioc.annotations(collection))
Expand All @@ -39,7 +39,7 @@ def test_annotations():

def test_relations():
filename = os.path.join(os.path.dirname(__file__), 'everything.xml')
with open(filename) as fp:
with open(filename, encoding='utf8') as fp:
collection = bioc.load(fp)

relations = list(bioc.relations(collection))
Expand Down
4 changes: 2 additions & 2 deletions tests/bioc/test_iterwrite.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

def test_iterwrite():
src = os.path.join(os.path.dirname(__file__), 'everything.xml')
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.load(fp)

tmp = tempfile.NamedTemporaryFile()
with bioc.iterwrite(tmp.name, collection) as writer:
for document in collection.documents:
writer.writedocument(document)

with open(tmp.name) as fp:
with open(tmp.name, encoding='utf8') as fp:
collection = bioc.load(fp)

assert_everything(collection)
12 changes: 6 additions & 6 deletions tests/bioc/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,31 +8,31 @@


def test_jsonload():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.jsonload(fp)
assert_everything(collection)


def test_jsonloads():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
s = fp.read()
collection = bioc.jsonloads(s)
assert_everything(collection)


def test_jsondump():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.jsonload(fp)
tmp = tempfile.NamedTemporaryFile()
with open(tmp.name, 'w') as fp:
with open(tmp.name, 'w', encoding='utf8') as fp:
bioc.jsondump(collection, fp)
with open(tmp.name) as fp:
with open(tmp.name, encoding='utf8') as fp:
collection = bioc.jsonload(fp)
assert_everything(collection)


def test_jsondumps():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.jsonload(fp)
s = bioc.jsondumps(collection)
collection = bioc.jsonloads(s)
Expand Down
4 changes: 2 additions & 2 deletions tests/bioc/test_jsonlines.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@


def test_jsonlines():
with open(src) as fp:
with open(src, encoding='utf8') as fp:
collection = bioc.jsonload(fp)

tmp = tempfile.NamedTemporaryFile()
with jsonlines.open(tmp.name, 'w') as writer:
with jsonlines.open(tmp.name, mode='w') as writer:
for doc in collection.documents:
writer.write(BioCJSONEncoder().default(doc))

Expand Down
6 changes: 3 additions & 3 deletions tests/bioc/test_utils.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@

def test_get_text():
filename = os.path.join(os.path.dirname(__file__), 'everything.xml')
with open(filename) as fp:
with open(filename, encoding='utf8') as fp:
collection = bioc.load(fp)

assert (0, 'abcdefghijklmnopqrstuvwxyz') == bioc.get_text(collection.documents[0])
assert (27, 'abcdefghijklm') == bioc.get_text(collection.documents[1].passages[0])
assert (0, '\n'*27 + 'abcdefghijklm') == bioc.get_text(collection.documents[1])
assert (27, 'abcdefg测试Non-ASCII') == bioc.get_text(collection.documents[1].passages[0])
assert (0, '\n'*27 + 'abcdefg测试Non-ASCII') == bioc.get_text(collection.documents[1])

with pytest.raises(ValueError):
next(bioc.get_text('Foo'))
4 changes: 2 additions & 2 deletions tests/bioc/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ def assert_everything(collection):
sentence = passage.sentences[1]
assert 34 == sentence.offset
assert 'sentence-infon-value' == sentence.infons['sentence-infon-key']
assert 'hijklm' == sentence.text
assert '测试Non-ASCII' == sentence.text
annotation = passage.sentences[0].annotations[0]
assert '3' == annotation.id
assert 'annotation-infon-value' == annotation.infons['annotation-infon-key']
Expand All @@ -57,7 +57,7 @@ def assert_everything(collection):
annotation = passage.sentences[1].annotations[0]
assert '4' == annotation.id
assert 'annotation-infon-value' == annotation.infons['annotation-infon-key']
assert 'hi' == annotation.text
assert '测试' == annotation.text
assert 34 == annotation.total_span.offset
assert 2 == annotation.total_span.length
relation = passage.sentences[0].relations[0]
Expand Down

0 comments on commit 244948f

Please sign in to comment.