diff --git a/collatex-pythonport/collatex/__init__.py b/collatex-pythonport/collatex/__init__.py
index 72cd2d045..a277707f2 100755
--- a/collatex-pythonport/collatex/__init__.py
+++ b/collatex-pythonport/collatex/__init__.py
@@ -7,8 +7,7 @@
from collatex.core_functions import Collation
from collatex.core_functions import collate
-from collatex.core_functions import collate_pretokenized_json
-__all__ = ["Collation", "collate", "collate_pretokenized_json"]
+__all__ = ["Collation", "collate"]
diff --git a/collatex-pythonport/collatex/collatex_suffix.py b/collatex-pythonport/collatex/collatex_suffix.py
index 4cd5ab4ee..713f7f3ac 100644
--- a/collatex-pythonport/collatex/collatex_suffix.py
+++ b/collatex-pythonport/collatex/collatex_suffix.py
@@ -188,7 +188,7 @@ def __init__(self, occurrences, tokens):
def debug(self):
result = []
for occurrence in self.occurrences:
- result.append(' '.join(self.tokens[occurrence.token_range.slices().next()]))
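+ # Python 3: generators no longer have a .next() method; use the next() builtin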
+ result.append(' '.join(self.tokens[next(occurrence.token_range.slices())]))
return result
diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py
index 6d62f2a06..9f1d21d92 100644
--- a/collatex-pythonport/collatex/core_classes.py
+++ b/collatex-pythonport/collatex/core_classes.py
@@ -13,7 +13,7 @@
import re
from prettytable import PrettyTable
from textwrap import fill
-from collatex.exceptions import TokenError
+from collatex.exceptions import TokenError, UnsupportedError
class Row(object):
@@ -161,20 +161,20 @@ def __repr__(self):
class Witness(object):
def __init__(self, witnessdata):
+ if 'id' not in witnessdata:
+ raise UnsupportedError("No defined id in witnessdata")
self.sigil = witnessdata['id']
self._tokens = []
if 'content' in witnessdata:
- self.content = witnessdata['content']
- # print("Witness "+sigil+" TOKENIZER IS CALLED!")
tokenizer = WordPunctuationTokenizer()
- tokens_as_strings = tokenizer.tokenize(self.content)
+ tokens_as_strings = tokenizer.tokenize(witnessdata['content'])
for token_string in tokens_as_strings:
self._tokens.append(Token({'t':token_string}))
elif 'tokens' in witnessdata:
for tk in witnessdata['tokens']:
self._tokens.append(Token(tk))
- # TODO no idea what this content string is needed for.
- self.content = ' '.join([x.token_string for x in self._tokens])
+ else:
+ raise UnsupportedError("No defined content/tokens in witness "+self.sigil)
def tokens(self):
return self._tokens
diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py
index c2d2d4709..318e0437f 100644
--- a/collatex-pythonport/collatex/core_functions.py
+++ b/collatex-pythonport/collatex/core_functions.py
@@ -27,10 +27,21 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
# check which output format is requested: graph or table
if output=="graph":
return graph
+
# create alignment table
table = AlignmentTable(collation, graph, layout)
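+ # pretokenized input: swap the aligned cell strings back for the original token dicts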
+ if collation.pretokenized and not segmentation:
+ token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses]
+ # only for segmentation=False;
+ # get_tokenized_at might need to behave differently if segmentation=True
+ table = get_tokenized_at(table, token_list, segmentation=segmentation, layout=layout)
+ # for display purposes, the table and html outputs return only the token's 't' string and not the full token_data dict
+ if output=="table" or output=="html":
+ for row in table.rows:
+ row.cells = [cell["t"] for cell in row.cells]
+
if output == "json":
- return export_alignment_table_as_json(table)
+ return export_alignment_table_as_json(table, layout=layout)
if output == "html":
return display_alignment_table_as_HTML(table)
if output == "table":
@@ -38,48 +49,37 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
else:
raise Exception("Unknown output type: "+output)
-
-
-#TODO: this only works with a table output at the moment
-#TODO: store the tokens on the graph instead
-def collate_pretokenized_json(json, output='table', layout='horizontal', **kwargs):
- # Takes more or less the same arguments as collate() above, but with some restrictions.
- # Only output types 'json' and 'table' are supported.
- if output not in ['json', 'table']:
- raise UnsupportedError("Output type" + kwargs['output'] + "not supported for pretokenized collation")
- if 'segmentation' in kwargs and kwargs['segmentation']:
- raise UnsupportedError("Segmented output not supported for pretokenized collation")
- kwargs['segmentation'] = False
-
- # For each witness given, make a 'shadow' witness based on the normalization tokens
- # that will actually be collated.
- tokenized_witnesses = []
- collation = Collation()
- for witness in json["witnesses"]:
- collation.add_witness(witness)
- tokenized_witnesses.append(witness["tokens"])
- at = collate(collation, output="table", **kwargs)
- tokenized_at = AlignmentTable(collation, layout=layout)
- for row, tokenized_witness in zip(at.rows, tokenized_witnesses):
- new_row = Row(row.header)
+def get_tokenized_at(table, token_list, segmentation=False, layout="horizontal"):
+ tokenized_at = AlignmentTable(Collation(), layout=layout)
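+ # table rows are in the order the witnesses were added, so zip pairs each row with its witness's token list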
+ for witness_row, witness_tokens in zip(table.rows, token_list):
+ new_row = Row(witness_row.header)
tokenized_at.rows.append(new_row)
- token_counter = 0
- for cell in row.cells:
- if cell != "-":
- new_row.cells.append(tokenized_witness[token_counter])
- token_counter+=1
- else:
- #TODO: should probably be null or None instead, but that would break the rendering at the moment
- new_row.cells.append({"t":"-"})
- if output=="json":
- return export_alignment_table_as_json(tokenized_at)
- if output=="table":
- # transform JSON objects to "t" form.
- for row in tokenized_at.rows:
- row.cells = [cell["t"] for cell in row.cells]
- return tokenized_at
-
-def export_alignment_table_as_json(table, indent=None, status=False):
+ counter = 0
+ for cell in witness_row.cells:
+ if cell == "-":
+ # TODO: should probably be null or None instead, but that would break the rendering at the moment (line 41)
+ new_row.cells.append({"t" : "-"})
+ # if segmentation=False
+ else:
+ new_row.cells.append(witness_tokens[counter])
+ counter+=1
+ # else if segmentation=True
+ ## token_list must be a list of Token objects instead of a list of dicts (update lines 34, 64)
+ ## line 41 will break for table/html output
+ #string = witness_tokens[counter].token_string
+ #token_counter = 1
+ #while string != cell:
+ #    if counter+token_counter-1 < len(witness_tokens)-1:
+ #        # append the next token's token_string until it matches the string in the cell
+ #        # (only if we are not at the last token)
+ #        string += ' '+witness_tokens[counter+token_counter].token_string
+ #        token_counter += 1
+ ## there is one list level too many in the output
+ #new_row.cells.append([tk.token_data for tk in witness_tokens[counter:counter+token_counter]])
+ #counter += token_counter
+ return tokenized_at
+
+def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"):
json_output = {}
json_output["table"]=[]
sigli = []
@@ -92,6 +92,9 @@ def export_alignment_table_as_json(table, indent=None, status=False):
for column in table.columns:
variant_status.append(column.variant)
json_output["status"]=variant_status
+ if layout=="vertical":
+ # transpose the table so that rows become columns
+ new_table = [list(column) for column in zip(*json_output["table"])]
+ json_output["table"] = new_table
return json.dumps(json_output, sort_keys=True, indent=indent)
'''
@@ -101,29 +104,40 @@ class Collation(object):
@classmethod
def create_from_dict(cls, data, limit=None):
+ if "witnesses" not in data:
+ raise UnsupportedError("Json input not valid")
witnesses = data["witnesses"]
collation = Collation()
for witness in witnesses[:limit]:
# generate collation object from json_data
collation.add_witness(witness)
+ # determine whether the data is pretokenized: any witness with a 'tokens' field marks the whole collation as pretokenized
+ if 'tokens' in witness:
+ collation.pretokenized = True
return collation
+ # JSON input can be given either as a string or as a file path
+ @classmethod
+ def create_from_json_string(cls, json_string):
+ data = json.loads(json_string)
+ collation = cls.create_from_dict(data)
+ return collation
+
@classmethod
- # json_data can be a string or a file
- def create_from_json(cls, json_data):
- data = json.load(json_data)
+ def create_from_json_file(cls, json_path):
+ with open(json_path, 'r') as json_file:
+ data = json.load(json_file)
collation = cls.create_from_dict(data)
return collation
def __init__(self):
self.witnesses = []
+ self.pretokenized = False
self.counter = 0
self.witness_ranges = {}
- self.combined_string = ""
self.cached_suffix_array = None
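+ # token strings of all witnesses in one flat list, witnesses separated by the marker tokens "$" and an index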
+ self.combined_tokens = []
- # the tokenization process happens multiple times
- # and by different tokenizers. This should be fixed
def add_witness(self, witnessdata):
# clear the suffix array and LCP array cache
self.cached_suffix_array = None
@@ -134,9 +148,11 @@ def add_witness(self, witnessdata):
# the extra one is for the marker token
self.counter += len(witness.tokens()) +2 # $ + number
self.witness_ranges[witness.sigil] = witness_range
- if not self.combined_string == "":
- self.combined_string += " $"+str(len(self.witnesses)-1)+ " "
- self.combined_string += witness.content
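+ # separate each additional witness with the marker tokens "$" and the new witness's index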
+ if len(self.witnesses) > 1:
+ self.combined_tokens.append('$')
+ self.combined_tokens.append(str(len(self.witnesses)-1))
+ for tk in witness.tokens():
+ self.combined_tokens.append(tk.token_string)
def add_plain_witness(self, sigil, content):
return self.add_witness({'id':sigil, 'content':content})
@@ -146,14 +162,11 @@ def get_range_for_witness(self, witness_sigil):
raise Exception("Witness "+witness_sigil+" is not added to the collation!")
return self.witness_ranges[witness_sigil]
- def get_combined_string(self):
- return self.combined_string
-
def get_sa(self):
#NOTE: implemented in a lazy manner, since calculation of the Suffix Array and LCP Array takes time
if not self.cached_suffix_array:
# Unit byte is done to skip tokenization in third party library
- self.cached_suffix_array = SuffixArray(self.tokens, unit=UNIT_BYTE)
+ self.cached_suffix_array = SuffixArray(self.combined_tokens, unit=UNIT_BYTE)
return self.cached_suffix_array
def get_suffix_array(self):
@@ -164,17 +177,7 @@ def get_lcp_array(self):
sa = self.get_sa()
return sa._LCP_values
-
def to_extended_suffix_array(self):
- return ExtendedSuffixArray(self.tokens, self.get_suffix_array(), self.get_lcp_array())
-
- @property
- def tokens(self):
- #print("COLLATION TOKENIZE IS CALLED!")
- #TODO: complete set of witnesses is retokenized here!
- tokenizer = WordPunctuationTokenizer()
- tokens = tokenizer.tokenize(self.get_combined_string())
- return tokens
-
+ return ExtendedSuffixArray(self.combined_tokens, self.get_suffix_array(), self.get_lcp_array())
diff --git a/collatex-pythonport/collatex/suffix_based_scorer.py b/collatex-pythonport/collatex/suffix_based_scorer.py
index cfb2713f2..27c8b468e 100644
--- a/collatex-pythonport/collatex/suffix_based_scorer.py
+++ b/collatex-pythonport/collatex/suffix_based_scorer.py
@@ -147,7 +147,7 @@ def _get_block_witness(self, witness):
occurrences.append(occurrence)
# sort occurrences on position
sorted_o = sorted(occurrences, key=attrgetter('lower_end'))
- block_witness = BlockWitness(sorted_o, self.collation.tokens)
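+ # use the collation's precomputed combined token list instead of retokenizing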
+ block_witness = BlockWitness(sorted_o, self.collation.combined_tokens)
return block_witness
'''
diff --git a/collatex-pythonport/tests/test_collate_outputs.py b/collatex-pythonport/tests/test_collate_outputs.py
new file mode 100644
index 000000000..cb9c8e5dd
--- /dev/null
+++ b/collatex-pythonport/tests/test_collate_outputs.py
@@ -0,0 +1,275 @@
+'''
+Created on March 24, 2015
+
+@author: Elisa Nury
+'''
+
+import unittest
+from collatex.core_functions import *
+from collatex.exceptions import UnsupportedError
+
+class TestCollate(unittest.TestCase):
+ def test_collate_with_invalid_output(self):
+ data = {"witnesses" :
+ [
+ {"id" : "A", "tokens" :
+ [
+ {"t": "A", "id": 1},
+ {"t": "small"},
+ {"t": "black"},
+ {"t": "cat"}
+ ]
+ },
+ {"id" : "B", "tokens" :
+ [
+ {"t": "A"},
+ {"t": "small"},
+ {"t": "white"},
+ {"t": "kitten.", "n": "cat"}
+ ]
+ }
+ ]
+ }
+ c = Collation.create_from_dict(data)
+ with self.assertRaises(Exception):
+ collate(c, output="xyz")
+
+ def test_collate_with_empty_collation(self):
+ c = Collation()
+ with self.assertRaises(IndexError):
+ collate(c)
+
+
+class TestTokenizedJsonOutput(unittest.TestCase):
+ def setUp(self):
+ self.data = {"witnesses" :
+ [
+ {"id" : "A", "tokens" :
+ [
+ {"t": "A", "id": 1},
+ {"t": "small"},
+ {"t": "black"},
+ {"t": "cat"}
+ ]
+ },
+ {"id" : "B", "tokens" :
+ [
+ {"t": "A"},
+ {"t": "small"},
+ {"t": "white"},
+ {"t": "kitten.", "n": "cat"}
+ ]
+ }
+ ]
+ }
+ self.c = Collation.create_from_dict(self.data)
+ self.maxDiff = None
+
+ #--------------------------------------------------
+ #JSON output
+ def test_tokenized_output_json_segmentationFalse_layoutHorizontal(self):
+ expected = '{"table": [[[{"id": 1, "t": "A"}], [{"t": "small"}], [{"t": "black"}], [{"t": "cat"}]], [[{"t": "A"}], [{"t": "small"}], [{"t": "white"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}'
+ output = collate(self.c, output="json", segmentation=False, layout="horizontal")
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_json_segmentationFalse_layoutVertical(self):
+ expected = '{"table": [[[{"id": 1, "t": "A"}], [{"t": "A"}]], [[{"t": "small"}], [{"t": "small"}]], [[{"t": "black"}], [{"t": "white"}]], [[{"t": "cat"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}'
+ output = collate(self.c, output="json", segmentation=False, layout="vertical")
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_json_segmentationTrue_layoutHorizontal(self):
+ expected = '{"table": [[["A small"], ["black"], ["cat"]], [["A small"], ["white"], ["cat"]]], "witnesses": ["A", "B"]}'
+ output = collate(self.c, output="json", segmentation=True, layout="horizontal")
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_json_segmentationTrue_layoutVertical(self):
+ expected = '{"table": [[["A small"], ["A small"]], [["black"], ["white"]], [["cat"], ["cat"]]], "witnesses": ["A", "B"]}'
+ output = collate(self.c, output="json", segmentation=True, layout="vertical")
+ self.assertEqual(output, expected)
+
+ #--------------------------------------------------
+ #TABLE output
+
+ def test_tokenized_output_table_segmentationFalse_layoutHorizontal(self):
+ expected = """\
++---+---+-------+-------+---------+
+| A | A | small | black | cat |
+| B | A | small | white | kitten. |
++---+---+-------+-------+---------+"""
+ output = str(collate(self.c, output="table", segmentation=False, layout="horizontal"))
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_table_segmentationFalse_layoutVertical(self):
+ expected = '''\
++-------+---------+
+| A | B |
++-------+---------+
+| A | A |
++-------+---------+
+| small | small |
++-------+---------+
+| black | white |
++-------+---------+
+| cat | kitten. |
++-------+---------+'''
+ output = str(collate(self.c, output="table", segmentation=False, layout="vertical"))
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_table_segmentationTrue_layoutHorizontal(self):
+ expected = """\
++---+---------+-------+-----+
+| A | A small | black | cat |
+| B | A small | white | cat |
++---+---------+-------+-----+"""
+ output = str(collate(self.c, output="table", segmentation=True, layout="horizontal"))
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_table_segmentationTrue_layoutVertical(self):
+ expected = '''\
++---------+---------+
+| A | B |
++---------+---------+
+| A small | A small |
++---------+---------+
+| black | white |
++---------+---------+
+| cat | cat |
++---------+---------+'''
+ output = str(collate(self.c, output="table", segmentation=True, layout="vertical"))
+ self.assertEqual(output, expected)
+
+ #--------------------------------------------------
+ #HTML output
+
+ def test_tokenized_output_html_segmentationFalse_layoutHorizontal(self):
+ expected = '''\
+<table>
+    <tr>
+        <td>A</td>
+        <td>A</td>
+        <td>small</td>
+        <td>black</td>
+        <td>cat</td>
+    </tr>
+    <tr>
+        <td>B</td>
+        <td>A</td>
+        <td>small</td>
+        <td>white</td>
+        <td>kitten.</td>
+    </tr>
+</table>
+'''
+ output = collate(self.c, output="html", segmentation=False, layout="horizontal")
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_html_segmentationFalse_layoutVertical(self):
+ expected = '''\
+<table>
+    <tr>
+        <td>A</td>
+        <td>B</td>
+    </tr>
+    <tr>
+        <td>A</td>
+        <td>A</td>
+    </tr>
+    <tr>
+        <td>small</td>
+        <td>small</td>
+    </tr>
+    <tr>
+        <td>black</td>
+        <td>white</td>
+    </tr>
+    <tr>
+        <td>cat</td>
+        <td>kitten.</td>
+    </tr>
+</table>
+'''
+ output = collate(self.c, output="html", segmentation=False, layout="vertical")
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_html_segmentationTrue_layoutHorizontal(self):
+ expected = '''\
+<table>
+    <tr>
+        <td>A</td>
+        <td>A small</td>
+        <td>black</td>
+        <td>cat</td>
+    </tr>
+    <tr>
+        <td>B</td>
+        <td>A small</td>
+        <td>white</td>
+        <td>cat</td>
+    </tr>
+</table>
+'''
+ output = collate(self.c, output="html", segmentation=True, layout="horizontal")
+ self.assertEqual(output, expected)
+
+ def test_tokenized_output_html_segmentationTrue_layoutVertical(self):
+ expected = '''\
+<table>
+    <tr>
+        <td>A</td>
+        <td>B</td>
+    </tr>
+    <tr>
+        <td>A small</td>
+        <td>A small</td>
+    </tr>
+    <tr>
+        <td>black</td>
+        <td>white</td>
+    </tr>
+    <tr>
+        <td>cat</td>
+        <td>cat</td>
+    </tr>
+</table>
+'''
+ output = collate(self.c, output="html", segmentation=True, layout="vertical")
+ self.assertEqual(output, expected)
+
+
+
+
+#--------------------------------------------------
+#Empty cells output
+
+class TestOutputEmptyCells(unittest.TestCase):
+ def setUp(self):
+ data = {
+ "witnesses" : [
+ {
+ "id" : "A",
+ "tokens" : [
+ { "t" : "A"},
+ { "t" : "black"},
+ { "t" : "cat"}
+ ]
+ },
+ {
+ "id" : "B",
+ "tokens" : [
+ { "t": "A" },
+ { "t": "kitten.", "n": "cat" }
+ ]
+ }
+ ]
+ }
+ self.c = Collation.create_from_dict(data)
+
+ def test_json_segmentationTrue_output_with_empty_cells(self):
+ expected = '{"table": [[["A"], ["black"], ["cat"]], [["A"], ["-"], ["cat"]]], "witnesses": ["A", "B"]}'
+ output = collate(self.c, output="json")
+ self.assertEqual(output, expected)
+
+ def test_json_segmentationFalse_output_with_empty_cells(self):
+ expected = '{"table": [[[{"t": "A"}], [{"t": "black"}], [{"t": "cat"}]], [[{"t": "A"}], [{"t": "-"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}'
+ output = collate(self.c, output="json", segmentation=False)
+ self.assertEqual(output, expected)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/collatex-pythonport/tests/test_collatex_block_witnesses.py b/collatex-pythonport/tests/test_collatex_block_witnesses.py
index 7cb412822..ee7cc3790 100644
--- a/collatex-pythonport/tests/test_collatex_block_witnesses.py
+++ b/collatex-pythonport/tests/test_collatex_block_witnesses.py
@@ -29,16 +29,15 @@ def test_combined_string_hermans_case(self):
collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
# $ is meant to separate witnesses here
- self.assertEquals("a b c d F g h i ! K ! q r s t $1 a b c d F g h i ! q r s t", collation.get_combined_string())
+ self.assertEquals("a b c d F g h i ! K ! q r s t $ 1 a b c d F g h i ! q r s t", " ".join(collation.combined_tokens))
# test whether the witness->range mapping works
- @unit_disabled
def test_witness_ranges_hermans_case(self):
collation = Collation()
collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
self.assertEquals(RangeSet("0-14"), collation.get_range_for_witness("W1"))
- self.assertEquals(RangeSet("16-28"), collation.get_range_for_witness("W2"))
+ self.assertEquals(RangeSet("17-29"), collation.get_range_for_witness("W2"))
# TODO: re-enable test!
# Note: LCP intervals can overlap
@@ -74,14 +73,13 @@ def test_lcp_child_intervals_hermans_case(self):
_, child_lcp_intervals = collation.get_lcp_intervals()
self.assertFalse(child_lcp_intervals)
- @unit_disabled
def test_non_overlapping_blocks_black_cat(self):
collation = Collation()
collation.add_plain_witness("W1", "the black cat")
collation.add_plain_witness("W2", "the black cat")
algorithm = Scorer(collation)
blocks = algorithm._get_non_overlapping_repeating_blocks()
- block1 = Block(RangeSet("0-2, 4-6"))
+ block1 = Block(RangeSet("0-2, 5-7"))
self.assertEqual([block1], blocks)
#TODO: Fix number of siblings!
@@ -97,17 +95,15 @@ def test_blocks_failing_transposition_use_case_old_algorithm(self):
block3 = Block(RangeSet("2, 8"))
self.assertEqual([block1, block2, block3], blocks)
- @unit_disabled
def test_non_overlapping_blocks_Hermans(self):
collation = Collation()
collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
algorithm = Scorer(collation)
blocks = algorithm._get_non_overlapping_repeating_blocks()
- self.assertIn(Block(RangeSet("0-8, 16-24")), blocks) # a b c d F g h i !
- self.assertIn(Block(RangeSet("11-14, 25-28")), blocks) # q r s t
+ self.assertIn(Block(RangeSet("0-8, 17-25")), blocks) # a b c d F g h i !
+ self.assertIn(Block(RangeSet("11-14, 26-29")), blocks) # q r s t
- @unit_disabled
def test_blocks_Hermans_case_three_witnesses(self):
collation = Collation()
collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
@@ -115,24 +111,22 @@ def test_blocks_Hermans_case_three_witnesses(self):
collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
algorithm = Scorer(collation)
blocks = algorithm._get_non_overlapping_repeating_blocks()
- self.assertIn(Block(RangeSet("0-3, 16-19, 30-33")), blocks) # a b c d
- self.assertIn(Block(RangeSet("5-7, 21-23, 35-37")), blocks) # g h i
- self.assertIn(Block(RangeSet("10-14, 24-28, 38-42")), blocks) # ! q r s t
- self.assertIn(Block(RangeSet("4, 20")), blocks) # F
+ self.assertIn(Block(RangeSet("0-3, 17-20, 32-35")), blocks) # a b c d
+ self.assertIn(Block(RangeSet("5-7, 22-24, 37-39")), blocks) # g h i
+ self.assertIn(Block(RangeSet("10-14, 25-29, 40-44")), blocks) # ! q r s t
+ self.assertIn(Block(RangeSet("4, 21")), blocks) # F
# In the new approach nothing should be split
- @unit_disabled
def test_blocks_splitting_token_case(self):
collation = Collation()
collation.add_plain_witness("W1", "a c b c")
collation.add_plain_witness("W2", "a c b")
algorithm = Scorer(collation)
blocks = algorithm._get_non_overlapping_repeating_blocks()
- block1 = Block(RangeSet("0-2, 5-7")) # a c b
+ block1 = Block(RangeSet("0-2, 6-8")) # a c b
self.assertIn(block1, blocks)
- @unit_disabled
def test_block_witnesses_Hermans_case_two_witnesses(self):
collation = Collation()
collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
@@ -143,7 +137,6 @@ def test_block_witnesses_Hermans_case_two_witnesses(self):
block_witness = algorithm._get_block_witness(collation.witnesses[1])
self.assertEquals(["a b c d F g h i !", "q r s t"], block_witness.debug())
- @unit_disabled
def test_block_witnesses_Hermans_case(self):
collation = Collation()
collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
@@ -241,4 +234,4 @@ def test_filter_potential_blocks(self):
if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.testName']
- unittest.main()
\ No newline at end of file
+ unittest.main()
diff --git a/collatex-pythonport/tests/test_collation_class.py b/collatex-pythonport/tests/test_collation_class.py
new file mode 100644
index 000000000..fd3d2ac7a
--- /dev/null
+++ b/collatex-pythonport/tests/test_collation_class.py
@@ -0,0 +1,85 @@
+'''
+Created on March 24, 2015
+
+@author: Elisa Nury
+'''
+
+import unittest
+from collatex.core_functions import *
+from collatex.exceptions import UnsupportedError
+from testfixtures import TempDirectory
+import os
+import json
+
+class TestCollationMethods(unittest.TestCase):
+
+ def test_collation_method_create_from_json_file(self):
+ with TempDirectory() as d:
+ #create a temporary file in a temporary directory
+ d.write('testfile.json', b'{"witnesses" : [{"id" : "A", "content" : "The fox."}, {"id" : "B", "content": "The dog"}]}')
+ c = Collation.create_from_json_file(os.path.join(d.path, 'testfile.json'))
+ self.assertEqual(len(c.witnesses), 2)
+
+ def test_collation_create_from_dict(self):
+ data = {"witnesses" : [{"id" : "A", "content" : "The fox."}, {"id" : "B", "content": "The dog"}]}
+ c = Collation.create_from_dict(data)
+ self.assertEqual(len(c.witnesses), 2)
+
+
+class TestCollationFunctions(unittest.TestCase):
+ def setUp(self):
+ data = {
+ 'witnesses' : [
+ {
+ 'id' : 'A',
+ 'content' : 'The cat'
+ },
+ {
+ 'id' : 'B',
+ 'tokens' : [
+ { 't' : 'The'},
+ { 't' : 'kitten'}
+ ]
+ }
+ ]
+ }
+ self.c = Collation.create_from_dict(data)
+
+ def test_collation_function_add_plain_witness(self):
+ self.c.add_plain_witness('C', 'A cat')
+ self.assertEqual(len(self.c.witnesses), 3)
+
+ def test_collation_function_add_witness(self):
+ witnessdata = {'id': 'C', 'tokens': [{ 't' : 'A'},{ 't' : 'cat'}]}
+ self.c.add_witness(witnessdata)
+ self.assertEqual(len(self.c.witnesses), 3)
+
+ @unittest.expectedFailure
+ def test_collation_function_add_witnesses_with_same_id(self):
+ witnessdata1 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'fox'}]}
+ witnessdata2 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'dog'}]}
+ self.c.add_witness(witnessdata1)
+ self.c.add_witness(witnessdata2)
+ self.assertEqual(len(self.c.witnesses), 4)
+
+ # the collation result is wrong => an exception should be raised instead...
+ #json_result = json.loads(collate(self.c, output='json'))
+ #self.assertEqual(json_result['table'][2][1], 'fox')
+ #self.assertEqual(json_result['table'][3][1], 'dog')
+ self.fail("It should not be possible to add 2 witnesses with the same id")
+
+ def test_collation_function_get_range_for_witness(self):
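+ # witness A ('The cat') occupies positions 0-1 and the marker tokens "$" and "1" occupy 2-3, so B starts at 4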
+ expected_range_B = RangeSet()
+ expected_range_B.add_range(4, 6)
+ self.assertEqual(self.c.get_range_for_witness('B'), expected_range_B)
+ self.assertRaises(Exception, self.c.get_range_for_witness, 'W')
+
+ #test other functions?
+ #get suffix array
+ #get sa
+ #get lcp array
+ #to extended suffix array
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/collatex-pythonport/tests/test_near_matching_pretokenized.py b/collatex-pythonport/tests/test_near_matching_pretokenized.py
index cad73a67e..7beb7f043 100644
--- a/collatex-pythonport/tests/test_near_matching_pretokenized.py
+++ b/collatex-pythonport/tests/test_near_matching_pretokenized.py
@@ -5,61 +5,63 @@
'''
import unittest
from tests import unit_disabled
-from collatex.core_functions import collate_pretokenized_json
+from collatex.core_functions import *
class Test(unittest.TestCase):
- json_in = {
- "witnesses" : [
- {
- "id" : "A",
- "tokens" : [
- { "t" : "I", "ref" : 123 },
- { "t" : "bought" , "adj" : True },
- { "t" : "this", "id" : "x3" },
- { "t" : "glass", "id" : "x4" },
- { "t" : ",", "type" : "punct" },
- { "t" : "because", "id" : "x5" },
- { "t" : "it", "id" : "x6" },
- { "t" : "matches" },
- { "t" : "those", "id" : "x7" },
- { "t" : "dinner", "id" : "x8" },
- { "t" : "plates", "id" : "x9" },
- { "t" : ".", "type" : "punct" }
- ]
- },
- {
- "id" : "B",
- "tokens" : [
- { "t" : "I" },
- { "t" : "bought" , "adj" : True },
- { "t" : "those", "id" : "abc" },
- { "t" : "glasses", "id" : "xyz" },
- { "t" : ".", "type" : "punct" }
- ]
+ def setUp(self):
+ json_in = {
+ "witnesses" : [
+ {
+ "id" : "A",
+ "tokens" : [
+ { "t" : "I", "ref" : 123 },
+ { "t" : "bought" , "adj" : True },
+ { "t" : "this", "id" : "x3" },
+ { "t" : "glass", "id" : "x4" },
+ { "t" : ",", "type" : "punct" },
+ { "t" : "because", "id" : "x5" },
+ { "t" : "it", "id" : "x6" },
+ { "t" : "matches" },
+ { "t" : "those", "id" : "x7" },
+ { "t" : "dinner", "id" : "x8" },
+ { "t" : "plates", "id" : "x9" },
+ { "t" : ".", "type" : "punct" }
+ ]
+ },
+ {
+ "id" : "B",
+ "tokens" : [
+ { "t" : "I" },
+ { "t" : "bought" , "adj" : True },
+ { "t" : "those", "id" : "abc" },
+ { "t" : "glasses", "id" : "xyz" },
+ { "t" : ".", "type" : "punct" }
+ ]
+ }
+ ]
}
- ]
- }
+ self.c = Collation.create_from_dict(json_in)
def test_exact_matching(self):
- result = collate_pretokenized_json(self.json_in)
- self.assertEquals(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."],
+ result = collate(self.c, segmentation=False)
+ self.assertEqual(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."],
result.rows[0].to_list())
- self.assertEquals(["I", "bought", "-", "-", "-", "-", "-", "-", "those", "glasses", "-", "."], result.rows[1].to_list())
+ self.assertEqual(["I", "bought", "-", "-", "-", "-", "-", "-", "those", "glasses", "-", "."], result.rows[1].to_list())
def test_near_matching(self):
- result = collate_pretokenized_json(self.json_in, near_match=True)
- self.assertEquals(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."],
+ result = collate(self.c, segmentation=False, near_match=True)
+ self.assertEqual(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."],
result.rows[0].to_list())
- self.assertEquals(["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-", "-", "."], result.rows[1].to_list())
+ self.assertEqual(["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-", "-", "."], result.rows[1].to_list())
# Re-enable this one if segmented output is ever supported on tokenized collation
@unit_disabled
def test_near_matching_segmented(self):
- result = collate_pretokenized_json(self.json_in, near_match=True, segmentation=True)
- self.assertEquals(["I bought", "this glass, because it matches those dinner plates."],
+ result = collate(self.c, near_match=True, segmentation=True)
+ self.assertEqual(["I bought", "this glass, because it matches those dinner plates."],
result.rows[0].to_list())
- self.assertEquals(["I bought", "those glasses."], result.rows[1].to_list())
+ self.assertEqual(["I bought", "those glasses."], result.rows[1].to_list())
if __name__ == "__main__":
diff --git a/collatex-pythonport/tests/test_token_class.py b/collatex-pythonport/tests/test_token_class.py
new file mode 100644
index 000000000..fe4088ad0
--- /dev/null
+++ b/collatex-pythonport/tests/test_token_class.py
@@ -0,0 +1,39 @@
+'''
+Created on March 24, 2015
+
+@author: Elisa Nury
+'''
+
+import unittest
+from collatex.core_classes import Token
+from collatex.exceptions import TokenError
+
+
+class TestToken(unittest.TestCase):
+
+ def test_creation_token_t(self):
+ data = {'t': 'fox', 'id': 123 }
+ t = Token(data)
+ self.assertEqual(t.token_string, 'fox')
+ self.assertEqual(t.token_data, data)
+
+ def test_creation_token_n(self):
+ data = {'t': 'kitten', 'n': 'cat'}
+ t = Token(data)
+ self.assertEqual(t.token_string, 'cat')
+ self.assertEqual(t.token_data, data)
+
+ def test_creation_token_none(self):
+ t = Token(None)
+ self.assertEqual(t.token_string, '')
+ self.assertIsNone(t.token_data)
+
+ def test_invalid_token_raises_exception(self):
+ with self.assertRaises(TokenError):
+ #data = {'x': 'abc'}
+ data = {}
+ Token(data)
+
+if __name__ == '__main__':
+ unittest.main()
+
diff --git a/collatex-pythonport/tests/test_witness_class.py b/collatex-pythonport/tests/test_witness_class.py
new file mode 100644
index 000000000..8f2e6e33b
--- /dev/null
+++ b/collatex-pythonport/tests/test_witness_class.py
@@ -0,0 +1,54 @@
+'''
+Created on March 24, 2015
+
+@author: Elisa Nury
+'''
+
+import unittest
+from collatex.core_classes import Witness, Token
+from collatex.exceptions import UnsupportedError, TokenError
+
+class TestWitness(unittest.TestCase):
+
+ def test_creation_witness_plain(self):
+ data = {'id': 'A', 'content': 'The quick brown fox jumped over the lazy dogs.'}
+ w = Witness(data)
+ self.assertEqual(w.sigil, 'A')
+ self.assertEqual(len(w.tokens()), 10)
+ self.assertEqual(w.tokens()[3].token_string, 'fox')
+
+ def test_creation_witness_pretokenized(self):
+ data = { 'id': 'B',
+ 'tokens': [
+ {'t': 'A', 'ref': 123},
+ {'t': 'black and blue', 'adj': True},
+ {'t': 'cat', 'id': 'xyz'},
+ {'t': 'bird.', 'id': 'abc'}
+ ]
+ }
+ w = Witness(data)
+ self.assertEqual(w.sigil, 'B')
+ self.assertEqual(len(w.tokens()), 4)
+
+ def test_invalid_witness_missing_id(self):
+ data = {'name': 'A', 'content': 'The quick brown fox jumped over the lazy dogs.'}
+ self.assertRaises(UnsupportedError, Witness, data)
+
+ def test_invalid_witness_missing_content_tokens(self):
+ data = {'id': 'A'}
+ self.assertRaises(UnsupportedError, Witness, data)
+
+ def test_invalid_witness_content_is_pretokenized(self):
+ #'content' is pretokenized instead of plain text
+ data = {'id': 'A', 'content': [{'t':'the'}, {'t':'fox'}]}
+ self.assertRaises(TypeError, Witness, data)
+
+ def test_invalid_witness_tokens_is_plain(self):
+ #'tokens' is plain text instead of pretokenized
+ data = {'id': 'A', 'tokens': 'The quick brown fox jumped over the lazy dogs.'}
+ self.assertRaises(TokenError, Witness, data)
+
+
+if __name__ == '__main__':
+ unittest.main()
+
diff --git a/collatex-pythonport/tests/test_witness_tokens.py b/collatex-pythonport/tests/test_witness_tokens.py
index 56e97f437..0a0ff7f07 100644
--- a/collatex-pythonport/tests/test_witness_tokens.py
+++ b/collatex-pythonport/tests/test_witness_tokens.py
@@ -6,7 +6,7 @@
import unittest
from collatex import Collation
-from collatex.core_functions import collate_pretokenized_json
+from collatex.core_functions import collate
class Test(unittest.TestCase):
@@ -52,7 +52,8 @@ def testPretokenizedWitness(self):
}
]
}
- result = collate_pretokenized_json(pretokenized_witness)
+ c = Collation.create_from_dict(pretokenized_witness)
+ result = collate(c, segmentation=False)
self.assertEqual(len(result.rows[0].to_list()), 4)
self.assertEqual(len(result.rows[1].to_list()), 4)
# The second witness should have a token that reads 'mousedog bird'.
diff --git a/collatex-pythonport/use_cases/json-test1.json b/collatex-pythonport/use_cases/json-test1.json
new file mode 100644
index 000000000..a419c0e49
--- /dev/null
+++ b/collatex-pythonport/use_cases/json-test1.json
@@ -0,0 +1,29 @@
+{"witnesses" :
+ [
+ {"id" : "A","tokens" :
+ [
+ {"t" : "The"},
+ {"t" : "quick"},
+ {"t" : "brown"},
+ {"t" : "fox"},
+ {"t" : "jumps"},
+ {"t" : "over"},
+ {"t" : "the"},
+ {"t" : "dog."}
+ ]
+ },
+
+ {"id" : "B", "tokens" :
+ [
+ {"t" : "The"},
+ {"t" : "brown"},
+ {"t" : "fox"},
+ {"t" : "jumps"},
+ {"t" : "over"},
+ {"t" : "the"},
+ {"t" : "lazy"},
+ {"t" : "dog."}
+ ]
+ }
+ ]
+}
diff --git a/collatex-pythonport/use_cases/json-test2.json b/collatex-pythonport/use_cases/json-test2.json
new file mode 100644
index 000000000..ca26f69af
--- /dev/null
+++ b/collatex-pythonport/use_cases/json-test2.json
@@ -0,0 +1,31 @@
+{"witnesses" :
+ [
+ {"id" : "C","tokens" :
+ [
+ {"t" : "The"},
+ {"t" : "quick"},
+ {"t" : "brown"},
+ {"t" : "fox"},
+ {"t" : "jumps"},
+ {"t" : "over"},
+ {"t" : "the"},
+ {"t" : "dog"},
+ {"t" : "."}
+ ]
+ },
+
+ {"id" : "D", "tokens" :
+ [
+ {"t" : "The"},
+ {"t" : "brown"},
+ {"t" : "fox"},
+ {"t" : "jumps"},
+ {"t" : "over"},
+ {"t" : "the"},
+ {"t" : "lazy"},
+ {"t" : "dog"},
+ {"t" : "."}
+ ]
+ }
+ ]
+}
diff --git a/collatex-pythonport/use_cases/json-test3.json b/collatex-pythonport/use_cases/json-test3.json
new file mode 100644
index 000000000..dcbee333e
--- /dev/null
+++ b/collatex-pythonport/use_cases/json-test3.json
@@ -0,0 +1,29 @@
+{"witnesses" :
+ [
+ {"id" : "E","tokens" :
+ [
+ {"t" : "The", "id": 1, "n": "the"},
+ {"t" : "quick", "id": 2},
+ {"t" : "brown", "id": 3},
+ {"t" : "fox", "id": 4},
+ {"t" : "jumps", "id": 5},
+ {"t" : "over", "id": 6},
+ {"t" : "the", "id": 7},
+ {"t" : "dog.", "id": 8, "n": "dog"}
+ ]
+ },
+
+ {"id" : "F", "tokens" :
+ [
+ {"t" : "The"},
+ {"t" : "brown"},
+ {"t" : "fox"},
+ {"t" : "jumps"},
+ {"t" : "over"},
+ {"t" : "the"},
+ {"t" : "lazy"},
+ {"t" : "dog."}
+ ]
+ }
+ ]
+}