diff --git a/collatex-pythonport/collatex/__init__.py b/collatex-pythonport/collatex/__init__.py index 72cd2d045..a277707f2 100755 --- a/collatex-pythonport/collatex/__init__.py +++ b/collatex-pythonport/collatex/__init__.py @@ -7,8 +7,7 @@ from collatex.core_functions import Collation from collatex.core_functions import collate -from collatex.core_functions import collate_pretokenized_json -__all__ = ["Collation", "collate", "collate_pretokenized_json"] +__all__ = ["Collation", "collate"] diff --git a/collatex-pythonport/collatex/collatex_suffix.py b/collatex-pythonport/collatex/collatex_suffix.py index 4cd5ab4ee..713f7f3ac 100644 --- a/collatex-pythonport/collatex/collatex_suffix.py +++ b/collatex-pythonport/collatex/collatex_suffix.py @@ -188,7 +188,7 @@ def __init__(self, occurrences, tokens): def debug(self): result = [] for occurrence in self.occurrences: - result.append(' '.join(self.tokens[occurrence.token_range.slices().next()])) + result.append(' '.join(self.tokens[next(occurrence.token_range.slices())])) return result diff --git a/collatex-pythonport/collatex/core_classes.py b/collatex-pythonport/collatex/core_classes.py index 6d62f2a06..9f1d21d92 100644 --- a/collatex-pythonport/collatex/core_classes.py +++ b/collatex-pythonport/collatex/core_classes.py @@ -13,7 +13,7 @@ import re from prettytable import PrettyTable from textwrap import fill -from collatex.exceptions import TokenError +from collatex.exceptions import TokenError, UnsupportedError class Row(object): @@ -161,20 +161,20 @@ def __repr__(self): class Witness(object): def __init__(self, witnessdata): + if 'id' not in witnessdata: + raise UnsupportedError("No defined id in witnessdata") self.sigil = witnessdata['id'] self._tokens = [] if 'content' in witnessdata: - self.content = witnessdata['content'] - # print("Witness "+sigil+" TOKENIZER IS CALLED!") tokenizer = WordPunctuationTokenizer() - tokens_as_strings = tokenizer.tokenize(self.content) + tokens_as_strings = tokenizer.tokenize(witnessdata['content']) for token_string in tokens_as_strings: self._tokens.append(Token({'t':token_string})) elif 'tokens' in witnessdata: for tk in witnessdata['tokens']: self._tokens.append(Token(tk)) - # TODO no idea what this content string is needed for. - self.content = ' '.join([x.token_string for x in self._tokens]) + else: + raise UnsupportedError("No defined content/tokens in witness "+self.sigil) def tokens(self): return self._tokens diff --git a/collatex-pythonport/collatex/core_functions.py b/collatex-pythonport/collatex/core_functions.py index c2d2d4709..318e0437f 100644 --- a/collatex-pythonport/collatex/core_functions.py +++ b/collatex-pythonport/collatex/core_functions.py @@ -27,10 +27,21 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n # check which output format is requested: graph or table if output=="graph": return graph + # create alignment table table = AlignmentTable(collation, graph, layout) + if collation.pretokenized and not segmentation: + token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses] + # only with segmentation=False + # there could be a different comportment of get_tokenized_table if semgentation=True + table = get_tokenized_at(table, token_list, segmentation=segmentation, layout=layout) + # for display purpose, table and html output will return only token 't' (string) and not the full token_data (dict) + if output=="table" or output=="html": + for row in table.rows: + row.cells = [cell["t"] for cell in row.cells] + if output == "json": - return export_alignment_table_as_json(table) + return export_alignment_table_as_json(table, layout=layout) if output == "html": return display_alignment_table_as_HTML(table) if output == "table": @@ -38,48 +49,37 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n else: raise Exception("Unknown output type: "+output) - - -#TODO: this only works with a table output at the moment -#TODO: store the tokens on the graph instead -def collate_pretokenized_json(json, output='table', layout='horizontal', **kwargs): - # Takes more or less the same arguments as collate() above, but with some restrictions. - # Only output types 'json' and 'table' are supported. - if output not in ['json', 'table']: - raise UnsupportedError("Output type" + kwargs['output'] + "not supported for pretokenized collation") - if 'segmentation' in kwargs and kwargs['segmentation']: - raise UnsupportedError("Segmented output not supported for pretokenized collation") - kwargs['segmentation'] = False - - # For each witness given, make a 'shadow' witness based on the normalization tokens - # that will actually be collated. - tokenized_witnesses = [] - collation = Collation() - for witness in json["witnesses"]: - collation.add_witness(witness) - tokenized_witnesses.append(witness["tokens"]) - at = collate(collation, output="table", **kwargs) - tokenized_at = AlignmentTable(collation, layout=layout) - for row, tokenized_witness in zip(at.rows, tokenized_witnesses): - new_row = Row(row.header) +def get_tokenized_at(table, token_list, segmentation=False, layout="horizontal"): + tokenized_at = AlignmentTable(Collation(), layout=layout) + for witness_row, witness_tokens in zip(table.rows, token_list): + new_row = Row(witness_row.header) tokenized_at.rows.append(new_row) - token_counter = 0 - for cell in row.cells: - if cell != "-": - new_row.cells.append(tokenized_witness[token_counter]) - token_counter+=1 - else: - #TODO: should probably be null or None instead, but that would break the rendering at the moment - new_row.cells.append({"t":"-"}) - if output=="json": - return export_alignment_table_as_json(tokenized_at) - if output=="table": - # transform JSON objects to "t" form. - for row in tokenized_at.rows: - row.cells = [cell["t"] for cell in row.cells] - return tokenized_at - -def export_alignment_table_as_json(table, indent=None, status=False): + counter = 0 + for cell in witness_row.cells: + if cell == "-": + # TODO: should probably be null or None instead, but that would break the rendering at the moment (line 41) + new_row.cells.append({"t" : "-"}) + # if segmentation=False + else: + new_row.cells.append(witness_tokens[counter]) + counter+=1 + # else if segmentation=True + ##token_list must be a list of Token instead of list of dict (update lines 34, 64) + ##line 41 will not be happy in case of table/html output + #string = witness_tokens[counter].token_string + #token_counter = 1 + #while string != cell : + # if counter+token_counter-1 < len(witness_tokens)-1: + # #add token_string of the next token until it is equivalent to the string in the cell + # #if we are not at the last token + # string += ' '+witness_tokens[counter+token_counter].token_string + # token_counter += 1 + ##there is one list level too many in the output + #new_row.cells.append([tk.token_data for tk in witness_tokens[counter:counter+token_counter]]) + #counter += token_counter. + return tokenized_at + +def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"): json_output = {} json_output["table"]=[] sigli = [] @@ -92,6 +92,9 @@ def export_alignment_table_as_json(table, indent=None, status=False): for column in table.columns: variant_status.append(column.variant) json_output["status"]=variant_status + if layout=="vertical": + new_table = [[row[i] for row in json_output["table"]] for i in range(len(row.cells))] + json_output["table"] = new_table return json.dumps(json_output, sort_keys=True, indent=indent) ''' @@ -101,29 +104,40 @@ class Collation(object): @classmethod def create_from_dict(cls, data, limit=None): + if "witnesses" not in data: + raise UnsupportedError("Json input not valid") witnesses = data["witnesses"] collation = Collation() for witness in witnesses[:limit]: # generate collation object from json_data collation.add_witness(witness) + # determine if data is pretokenized + if 'tokens' in witness: + collation.pretokenized = True return collation + # json input can be a string or a file + @classmethod + def create_from_json_string(cls, json_string): + data = json.loads(json_string) + collation = cls.create_from_dict(data) + return collation + @classmethod - # json_data can be a string or a file - def create_from_json(cls, json_data): - data = json.load(json_data) + def create_from_json_file(cls, json_path): + with open(json_path, 'r') as json_file: + data = json.load(json_file) collation = cls.create_from_dict(data) return collation def __init__(self): self.witnesses = [] + self.pretokenized = False self.counter = 0 self.witness_ranges = {} - self.combined_string = "" self.cached_suffix_array = None + self.combined_tokens =[] - # the tokenization process happens multiple times - # and by different tokenizers. This should be fixed def add_witness(self, witnessdata): # clear the suffix array and LCP array cache self.cached_suffix_array = None @@ -134,9 +148,11 @@ def add_witness(self, witnessdata): # the extra one is for the marker token self.counter += len(witness.tokens()) +2 # $ + number self.witness_ranges[witness.sigil] = witness_range - if not self.combined_string == "": - self.combined_string += " $"+str(len(self.witnesses)-1)+ " " - self.combined_string += witness.content + if len(self.witnesses) > 1: + self.combined_tokens.append('$') + self.combined_tokens.append(str(len(self.witnesses)-1)) + for tk in witness.tokens(): + self.combined_tokens.append(tk.token_string) def add_plain_witness(self, sigil, content): return self.add_witness({'id':sigil, 'content':content}) @@ -146,14 +162,11 @@ def get_range_for_witness(self, witness_sigil): raise Exception("Witness "+witness_sigil+" is not added to the collation!") return self.witness_ranges[witness_sigil] - def get_combined_string(self): - return self.combined_string - def get_sa(self): #NOTE: implemented in a lazy manner, since calculation of the Suffix Array and LCP Array takes time if not self.cached_suffix_array: # Unit byte is done to skip tokenization in third party library - self.cached_suffix_array = SuffixArray(self.tokens, unit=UNIT_BYTE) + self.cached_suffix_array = SuffixArray(self.combined_tokens, unit=UNIT_BYTE) return self.cached_suffix_array def get_suffix_array(self): @@ -164,17 +177,7 @@ def get_lcp_array(self): sa = self.get_sa() return sa._LCP_values - def to_extended_suffix_array(self): - return ExtendedSuffixArray(self.tokens, self.get_suffix_array(), self.get_lcp_array()) - - @property - def tokens(self): - #print("COLLATION TOKENIZE IS CALLED!") - #TODO: complete set of witnesses is retokenized here! - tokenizer = WordPunctuationTokenizer() - tokens = tokenizer.tokenize(self.get_combined_string()) - return tokens - + return ExtendedSuffixArray(self.combined_tokens, self.get_suffix_array(), self.get_lcp_array()) diff --git a/collatex-pythonport/collatex/suffix_based_scorer.py b/collatex-pythonport/collatex/suffix_based_scorer.py index cfb2713f2..27c8b468e 100644 --- a/collatex-pythonport/collatex/suffix_based_scorer.py +++ b/collatex-pythonport/collatex/suffix_based_scorer.py @@ -147,7 +147,7 @@ def _get_block_witness(self, witness): occurrences.append(occurrence) # sort occurrences on position sorted_o = sorted(occurrences, key=attrgetter('lower_end')) - block_witness = BlockWitness(sorted_o, self.collation.tokens) + block_witness = BlockWitness(sorted_o, self.collation.combined_tokens) return block_witness ''' diff --git a/collatex-pythonport/tests/test_collate_outputs.py b/collatex-pythonport/tests/test_collate_outputs.py new file mode 100644 index 000000000..cb9c8e5dd --- /dev/null +++ b/collatex-pythonport/tests/test_collate_outputs.py @@ -0,0 +1,275 @@ +''' +Created on March 24, 2015 + +@author: Elisa Nury +''' + +import unittest +from collatex.core_functions import * +from collatex.exceptions import UnsupportedError + +class TestCollate(unittest.TestCase): + def test_collate_with_invalid_output(self): + data = {"witnesses" : + [ + {"id" : "A", "tokens" : + [ + {"t": "A", "id": 1}, + {"t": "small"}, + {"t": "black"}, + {"t": "cat"} + ] + }, + {"id" : "B", "tokens" : + [ + {"t": "A"}, + {"t": "small"}, + {"t": "white"}, + {"t": "kitten.", "n": "cat"} + ] + } + ] + } + c = Collation.create_from_dict(data) + with self.assertRaises(Exception): + collate(c, output="xyz") + + def test_collate_with_empty_collation(self): + c = Collation() + with self.assertRaises(IndexError): + collate(c) + + +class TestTokenizedJsonOutput(unittest.TestCase): + def setUp(self): + self.data = {"witnesses" : + [ + {"id" : "A", "tokens" : + [ + {"t": "A", "id": 1}, + {"t": "small"}, + {"t": "black"}, + {"t": "cat"} + ] + }, + {"id" : "B", "tokens" : + [ + {"t": "A"}, + {"t": "small"}, + {"t": "white"}, + {"t": "kitten.", "n": "cat"} + ] + } + ] + } + self.c = Collation.create_from_dict(self.data) + self.maxDiff = None + + #-------------------------------------------------- + #JSON output + def test_tokenized_output_json_segmentationFalse_layoutHorizontal(self): + expected = '{"table": [[[{"id": 1, "t": "A"}], [{"t": "small"}], [{"t": "black"}], [{"t": "cat"}]], [[{"t": "A"}], [{"t": "small"}], [{"t": "white"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=False, layout="horizontal") + self.assertEqual(output, expected) + + def test_tokenized_output_json_segmentationFalse_layoutVertical(self): + expected = '{"table": [[[{"id": 1, "t": "A"}], [{"t": "A"}]], [[{"t": "small"}], [{"t": "small"}]], [[{"t": "black"}], [{"t": "white"}]], [[{"t": "cat"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=False, layout="vertical") + self.assertEqual(output, expected) + + def test_tokenized_output_json_segmentationTrue_layoutHorizontal(self): + expected = '{"table": [[["A small"], ["black"], ["cat"]], [["A small"], ["white"], ["cat"]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=True, layout="horizontal") + self.assertEqual(output, expected) + + def test_tokenized_output_json_segmentationTrue_layoutVertical(self): + expected = '{"table": [[["A small"], ["A small"]], [["black"], ["white"]], [["cat"], ["cat"]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=True, layout="vertical") + self.assertEqual(output, expected) + + #-------------------------------------------------- + #TABLE output + + def test_tokenized_output_table_segmentationFalse_layoutHorizontal(self): + expected = """\ ++---+---+-------+-------+---------+ +| A | A | small | black | cat | +| B | A | small | white | kitten. | ++---+---+-------+-------+---------+""" + output = str(collate(self.c, output="table", segmentation=False, layout="horizontal")) + self.assertEqual(output, expected) + + def test_tokenized_output_table_segmentationFalse_layoutVertical(self): + expected = '''\ ++-------+---------+ +| A | B | ++-------+---------+ +| A | A | ++-------+---------+ +| small | small | ++-------+---------+ +| black | white | ++-------+---------+ +| cat | kitten. | ++-------+---------+''' + output = str(collate(self.c, output="table", segmentation=False, layout="vertical")) + self.assertEqual(output, expected) + + def test_tokenized_output_table_segmentationTrue_layoutHorizontal(self): + expected = """\ ++---+---------+-------+-----+ +| A | A small | black | cat | +| B | A small | white | cat | ++---+---------+-------+-----+""" + output = str(collate(self.c, output="table", segmentation=True, layout="horizontal")) + self.assertEqual(output, expected) + + def test_tokenized_output_table_segmentationTrue_layoutVertical(self): + expected = '''\ ++---------+---------+ +| A | B | ++---------+---------+ +| A small | A small | ++---------+---------+ +| black | white | ++---------+---------+ +| cat | cat | ++---------+---------+''' + output = str(collate(self.c, output="table", segmentation=True, layout="vertical")) + self.assertEqual(output, expected) + + #-------------------------------------------------- + #HTML output + + def test_tokenized_output_html_segmentationFalse_layoutHorizontal(self): + expected = '''\ + + + + + + + + + + + + + + + +
AAsmallblackcat
BAsmallwhitekitten.
''' + output = collate(self.c, output="html", segmentation=False, layout="horizontal") + self.assertEqual(output, expected) + + def test_tokenized_output_html_segmentationFalse_layoutVertical(self): + expected = '''\ + + + + + + + + + + + + + + + + + + + + + +
AB
AA
smallsmall
blackwhite
catkitten.
''' + output = collate(self.c, output="html", segmentation=False, layout="vertical") + self.assertEqual(output, expected) + + def test_tokenized_output_html_segmentationTrue_layoutHorizontal(self): + expected = '''\ + + + + + + + + + + + + + +
AA smallblackcat
BA smallwhitecat
''' + output = collate(self.c, output="html", segmentation=True, layout="horizontal") + self.assertEqual(output, expected) + + def test_tokenized_output_html_segmentationTrue_layoutVertical(self): + expected = '''\ + + + + + + + + + + + + + + + + + +
AB
A smallA small
blackwhite
catcat
''' + output = collate(self.c, output="html", segmentation=True, layout="vertical") + self.assertEqual(output, expected) + + + + +#-------------------------------------------------- +#Empty cells output + +class TestOutputEmptyCells(unittest.TestCase): + def setUp(self): + data = { + "witnesses" : [ + { + "id" : "A", + "tokens" : [ + { "t" : "A"}, + { "t" : "black"}, + { "t" : "cat"} + ] + }, + { + "id" : "B", + "tokens" : [ + { "t": "A" }, + { "t": "kitten.", "n": "cat" } + ] + } + ] + } + self.c = Collation.create_from_dict(data) + + def test_json_segmentationTrue_output_with_empty_cells(self): + expected = '{"table": [[["A"], ["black"], ["cat"]], [["A"], ["-"], ["cat"]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json") + self.assertEqual(output, expected) + + def test_json_segmentationFalse_output_with_empty_cells(self): + expected = '{"table": [[[{"t": "A"}], [{"t": "black"}], [{"t": "cat"}]], [[{"t": "A"}], [{"t": "-"}], [{"n": "cat", "t": "kitten."}]]], "witnesses": ["A", "B"]}' + output = collate(self.c, output="json", segmentation=False) + self.assertEqual(output, expected) + + +if __name__ == '__main__': + unittest.main() diff --git a/collatex-pythonport/tests/test_collatex_block_witnesses.py b/collatex-pythonport/tests/test_collatex_block_witnesses.py index 7cb412822..ee7cc3790 100644 --- a/collatex-pythonport/tests/test_collatex_block_witnesses.py +++ b/collatex-pythonport/tests/test_collatex_block_witnesses.py @@ -29,16 +29,15 @@ def test_combined_string_hermans_case(self): collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") collation.add_plain_witness("W2", "a b c d F g h i ! q r s t") # $ is meant to separate witnesses here - self.assertEquals("a b c d F g h i ! K ! q r s t $1 a b c d F g h i ! q r s t", collation.get_combined_string()) + self.assertEquals("a b c d F g h i ! K ! q r s t $ 1 a b c d F g h i ! q r s t", " ".join(collation.combined_tokens)) # test whether the witness->range mapping works - @unit_disabled def test_witness_ranges_hermans_case(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") collation.add_plain_witness("W2", "a b c d F g h i ! q r s t") self.assertEquals(RangeSet("0-14"), collation.get_range_for_witness("W1")) - self.assertEquals(RangeSet("16-28"), collation.get_range_for_witness("W2")) + self.assertEquals(RangeSet("17-29"), collation.get_range_for_witness("W2")) # TODO: re-enable test! # Note: LCP intervals can overlap @@ -74,14 +73,13 @@ def test_lcp_child_intervals_hermans_case(self): _, child_lcp_intervals = collation.get_lcp_intervals() self.assertFalse(child_lcp_intervals) - @unit_disabled def test_non_overlapping_blocks_black_cat(self): collation = Collation() collation.add_plain_witness("W1", "the black cat") collation.add_plain_witness("W2", "the black cat") algorithm = Scorer(collation) blocks = algorithm._get_non_overlapping_repeating_blocks() - block1 = Block(RangeSet("0-2, 4-6")) + block1 = Block(RangeSet("0-2, 5-7")) self.assertEqual([block1], blocks) #TODO: Fix number of siblings! @@ -97,17 +95,15 @@ def test_blocks_failing_transposition_use_case_old_algorithm(self): block3 = Block(RangeSet("2, 8")) self.assertEqual([block1, block2, block3], blocks) - @unit_disabled def test_non_overlapping_blocks_Hermans(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") collation.add_plain_witness("W2", "a b c d F g h i ! q r s t") algorithm = Scorer(collation) blocks = algorithm._get_non_overlapping_repeating_blocks() - self.assertIn(Block(RangeSet("0-8, 16-24")), blocks) # a b c d F g h i ! - self.assertIn(Block(RangeSet("11-14, 25-28")), blocks) # q r s t + self.assertIn(Block(RangeSet("0-8, 17-25")), blocks) # a b c d F g h i ! + self.assertIn(Block(RangeSet("11-14, 26-29")), blocks) # q r s t - @unit_disabled def test_blocks_Hermans_case_three_witnesses(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") @@ -115,24 +111,22 @@ def test_blocks_Hermans_case_three_witnesses(self): collation.add_plain_witness("W3", "a b c d E g h i ! q r s t") algorithm = Scorer(collation) blocks = algorithm._get_non_overlapping_repeating_blocks() - self.assertIn(Block(RangeSet("0-3, 16-19, 30-33")), blocks) # a b c d - self.assertIn(Block(RangeSet("5-7, 21-23, 35-37")), blocks) # g h i - self.assertIn(Block(RangeSet("10-14, 24-28, 38-42")), blocks) # ! q r s t - self.assertIn(Block(RangeSet("4, 20")), blocks) # F + self.assertIn(Block(RangeSet("0-3, 17-20, 32-35")), blocks) # a b c d + self.assertIn(Block(RangeSet("5-7, 22-24, 37-39")), blocks) # g h i + self.assertIn(Block(RangeSet("10-14, 25-29, 40-44")), blocks) # ! q r s t + self.assertIn(Block(RangeSet("4, 21")), blocks) # F # In the new approach nothing should be split - @unit_disabled def test_blocks_splitting_token_case(self): collation = Collation() collation.add_plain_witness("W1", "a c b c") collation.add_plain_witness("W2", "a c b") algorithm = Scorer(collation) blocks = algorithm._get_non_overlapping_repeating_blocks() - block1 = Block(RangeSet("0-2, 5-7")) # a c b + block1 = Block(RangeSet("0-2, 6-8")) # a c b self.assertIn(block1, blocks) - @unit_disabled def test_block_witnesses_Hermans_case_two_witnesses(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") @@ -143,7 +137,6 @@ def test_block_witnesses_Hermans_case_two_witnesses(self): block_witness = algorithm._get_block_witness(collation.witnesses[1]) self.assertEquals(["a b c d F g h i !", "q r s t"], block_witness.debug()) - @unit_disabled def test_block_witnesses_Hermans_case(self): collation = Collation() collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t") @@ -241,4 +234,4 @@ def test_filter_potential_blocks(self): if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.testName'] - unittest.main() \ No newline at end of file + unittest.main() diff --git a/collatex-pythonport/tests/test_collation_class.py b/collatex-pythonport/tests/test_collation_class.py new file mode 100644 index 000000000..fd3d2ac7a --- /dev/null +++ b/collatex-pythonport/tests/test_collation_class.py @@ -0,0 +1,85 @@ +''' +Created on March 24, 2015 + +@author: Elisa Nury +''' + +import unittest +from collatex.core_functions import * +from collatex.exceptions import UnsupportedError +from testfixtures import TempDirectory +import os +import json + +class TestCollationMethods(unittest.TestCase): + + def test_collation_method_create_from_json_file(self): + with TempDirectory() as d: + #create a temporary file in a temporary directory + d.write('testfile.json', b'{"witnesses" : [{"id" : "A", "content" : "The fox."}, {"id" : "B", "content": "The dog"}]}') + c = Collation.create_from_json_file(os.path.join(d.path, 'testfile.json')) + self.assertEqual(len(c.witnesses), 2) + + def test_collation_create_from_dict(self): + data = {"witnesses" : [{"id" : "A", "content" : "The fox."}, {"id" : "B", "content": "The dog"}]} + c = Collation.create_from_dict(data) + self.assertEqual(len(c.witnesses), 2) + + +class TestCollationFunctions(unittest.TestCase): + def setUp(self): + data = { + 'witnesses' : [ + { + 'id' : 'A', + 'content' : 'The cat' + }, + { + 'id' : 'B', + 'tokens' : [ + { 't' : 'The'}, + { 't' : 'kitten'} + ] + } + ] + } + self.c = Collation.create_from_dict(data) + + def test_collation_function_add_plain_witness(self): + self.c.add_plain_witness('C', 'A cat') + self.assertEqual(len(self.c.witnesses), 3) + + def test_collation_function_add_witness(self): + witnessdata = {'id': 'C', 'tokens': [{ 't' : 'A'},{ 't' : 'cat'}]} + self.c.add_witness(witnessdata) + self.assertEqual(len(self.c.witnesses), 3) + + @unittest.expectedFailure + def test_collation_function_add_witnesses_with_same_id(self): + witnessdata1 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'fox'}]} + witnessdata2 = {'id': 'C', 'tokens': [{ 't' : 'The'},{ 't': 'dog'}]} + self.c.add_witness(witnessdata1) + self.c.add_witness(witnessdata2) + self.assertEqual(len(self.c.witnesses), 4) + + #error in the collation result => there should be an exception raised... + #json_result = json.loads(collate(self.c, output='json')) + #self.assertEqual(json_result['table'][2][1], 'fox') + #self.assertEqual(json_result['table'][3][1], 'dog') + self.fail("It should not be possible to add 2 witnesses with the same id") + + def test_collation_function_get_range_for_witness(self): + expected_range_B = RangeSet() + expected_range_B.add_range(4, 6) + self.assertEqual(self.c.get_range_for_witness('B'), expected_range_B) + self.assertRaises(Exception, self.c.get_range_for_witness, 'W') + + #test other functions? + #get suffix array + #get sa + #get lcp array + #to extended suffix array + + +if __name__ == '__main__': + unittest.main() diff --git a/collatex-pythonport/tests/test_near_matching_pretokenized.py b/collatex-pythonport/tests/test_near_matching_pretokenized.py index cad73a67e..7beb7f043 100644 --- a/collatex-pythonport/tests/test_near_matching_pretokenized.py +++ b/collatex-pythonport/tests/test_near_matching_pretokenized.py @@ -5,61 +5,63 @@ ''' import unittest from tests import unit_disabled -from collatex.core_functions import collate_pretokenized_json +from collatex.core_functions import * class Test(unittest.TestCase): - json_in = { - "witnesses" : [ - { - "id" : "A", - "tokens" : [ - { "t" : "I", "ref" : 123 }, - { "t" : "bought" , "adj" : True }, - { "t" : "this", "id" : "x3" }, - { "t" : "glass", "id" : "x4" }, - { "t" : ",", "type" : "punct" }, - { "t" : "because", "id" : "x5" }, - { "t" : "it", "id" : "x6" }, - { "t" : "matches" }, - { "t" : "those", "id" : "x7" }, - { "t" : "dinner", "id" : "x8" }, - { "t" : "plates", "id" : "x9" }, - { "t" : ".", "type" : "punct" } - ] - }, - { - "id" : "B", - "tokens" : [ - { "t" : "I" }, - { "t" : "bought" , "adj" : True }, - { "t" : "those", "id" : "abc" }, - { "t" : "glasses", "id" : "xyz" }, - { "t" : ".", "type" : "punct" } - ] + def setUp(self): + json_in = { + "witnesses" : [ + { + "id" : "A", + "tokens" : [ + { "t" : "I", "ref" : 123 }, + { "t" : "bought" , "adj" : True }, + { "t" : "this", "id" : "x3" }, + { "t" : "glass", "id" : "x4" }, + { "t" : ",", "type" : "punct" }, + { "t" : "because", "id" : "x5" }, + { "t" : "it", "id" : "x6" }, + { "t" : "matches" }, + { "t" : "those", "id" : "x7" }, + { "t" : "dinner", "id" : "x8" }, + { "t" : "plates", "id" : "x9" }, + { "t" : ".", "type" : "punct" } + ] + }, + { + "id" : "B", + "tokens" : [ + { "t" : "I" }, + { "t" : "bought" , "adj" : True }, + { "t" : "those", "id" : "abc" }, + { "t" : "glasses", "id" : "xyz" }, + { "t" : ".", "type" : "punct" } + ] + } + ] } - ] - } + self.c = Collation.create_from_dict(json_in) def test_exact_matching(self): - result = collate_pretokenized_json(self.json_in) - self.assertEquals(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."], + result = collate(self.c, segmentation=False) + self.assertEqual(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."], result.rows[0].to_list()) - self.assertEquals(["I", "bought", "-", "-", "-", "-", "-", "-", "those", "glasses", "-", "."], result.rows[1].to_list()) + self.assertEqual(["I", "bought", "-", "-", "-", "-", "-", "-", "those", "glasses", "-", "."], result.rows[1].to_list()) def test_near_matching(self): - result = collate_pretokenized_json(self.json_in, near_match=True) - self.assertEquals(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."], + result = collate(self.c, segmentation=False, near_match=True) + self.assertEqual(["I", "bought", "this", "glass", ",", "because", "it", "matches", "those", "dinner", "plates", "."], result.rows[0].to_list()) - self.assertEquals(["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-", "-", "."], result.rows[1].to_list()) + self.assertEqual(["I", "bought", "those", "glasses", "-", "-", "-", "-", "-", "-", "-", "."], result.rows[1].to_list()) # Re-enable this one if segmented output is ever supported on tokenized collation @unit_disabled def test_near_matching_segmented(self): - result = collate_pretokenized_json(self.json_in, near_match=True, segmentation=True) - self.assertEquals(["I bought", "this glass, because it matches those dinner plates."], + result = collate(self.c, near_match=True, segmentation=True) + self.assertEqual(["I bought", "this glass, because it matches those dinner plates."], result.rows[0].to_list()) - self.assertEquals(["I bought", "those glasses."], result.rows[1].to_list()) + self.assertEqual(["I bought", "those glasses."], result.rows[1].to_list()) if __name__ == "__main__": diff --git a/collatex-pythonport/tests/test_token_class.py b/collatex-pythonport/tests/test_token_class.py new file mode 100644 index 000000000..fe4088ad0 --- /dev/null +++ b/collatex-pythonport/tests/test_token_class.py @@ -0,0 +1,39 @@ +''' +Created on March 24, 2015 + +@author: Elisa Nury +''' + +import unittest +from collatex.core_classes import Token +from collatex.exceptions import TokenError + + +class TestToken(unittest.TestCase): + + def test_creation_token_t(self): + data = {'t': 'fox', 'id': 123 } + t = Token(data) + self.assertEqual(t.token_string, 'fox') + self.assertEqual(t.token_data, data) + + def test_creation_token_n(self): + data = {'t': 'kitten', 'n': 'cat'} + t = Token(data) + self.assertEqual(t.token_string, 'cat') + self.assertEqual(t.token_data, data) + + def test_creation_token_none(self): + t = Token(None) + self.assertEqual(t.token_string, '') + self.assertIsNone(t.token_data) + + def test_invalid_token_raises_exception(self): + with self.assertRaises(TokenError): + #data = {'x': 'abc'} + data = {} + Token(data) + +if __name__ == '__main__': + unittest.main() + diff --git a/collatex-pythonport/tests/test_witness_class.py b/collatex-pythonport/tests/test_witness_class.py new file mode 100644 index 000000000..8f2e6e33b --- /dev/null +++ b/collatex-pythonport/tests/test_witness_class.py @@ -0,0 +1,54 @@ +''' +Created on March 24, 2015 + +@author: Elisa Nury +''' + +import unittest +from collatex.core_classes import Witness, Token, Tokenizer +from collatex.exceptions import UnsupportedError, TokenError + +class TestWitness(unittest.TestCase): + + def test_creation_witness_plain(self): + data = {'id': 'A', 'content': 'The quick brown fox jumped over the lazy dogs.'} + w = Witness(data) + self.assertEqual(w.sigil, 'A') + self.assertEqual(len(w.tokens()), 10) + self.assertEqual(w.tokens()[3].token_string, 'fox') + + def test_creation_witness_pretokenized(self): + data = { 'id': 'B', + 'tokens': [ + {'t': 'A', 'ref': 123}, + {'t': 'black and blue', 'adj': True}, + {'t': 'cat', 'id': 'xyz'}, + {'t': 'bird.', 'id': 'abc'} + ] + } + w = Witness(data) + self.assertEqual(w.sigil, 'B') + self.assertEqual(len(w.tokens()), 4) + + def test_invalid_witness_missing_id(self): + data = {'name': 'A', 'content': 'The quick brown fox jumped over the lazy dogs.'} + self.assertRaises(UnsupportedError, Witness, data) + + def test_invalid_witness_missing_content_tokens(self): + data = {'id': 'A'} + self.assertRaises(UnsupportedError, Witness, data) + + def test_invalid_witness_content_is_pretokenized(self): + #'content' is pretokenized instead of plain text + data = {'id': 'A', 'content': [{'t':'the'}, {'t':'fox'}]} + self.assertRaises(TypeError, Witness, data) + + def test_invalid_witness_tokens_is_plain(self): + #'tokens' is plain text instead of pretokenized + data = {'id': 'A', 'tokens': 'The quick brown fox jumped over the lazy dogs.'} + self.assertRaises(TokenError, Witness, data) + + +if __name__ == '__main__': + unittest.main() + diff --git a/collatex-pythonport/tests/test_witness_tokens.py b/collatex-pythonport/tests/test_witness_tokens.py index 56e97f437..0a0ff7f07 100644 --- a/collatex-pythonport/tests/test_witness_tokens.py +++ b/collatex-pythonport/tests/test_witness_tokens.py @@ -6,7 +6,7 @@ import unittest from collatex import Collation -from collatex.core_functions import collate_pretokenized_json +from collatex.core_functions import collate class Test(unittest.TestCase): @@ -52,7 +52,8 @@ def testPretokenizedWitness(self): } ] } - result = collate_pretokenized_json(pretokenized_witness) + c = Collation.create_from_dict(pretokenized_witness) + result = collate(c, segmentation=False) self.assertEqual(len(result.rows[0].to_list()), 4) self.assertEqual(len(result.rows[1].to_list()), 4) # The second witness should have a token that reads 'mousedog bird'. diff --git a/collatex-pythonport/use_cases/json-test1.json b/collatex-pythonport/use_cases/json-test1.json new file mode 100644 index 000000000..a419c0e49 --- /dev/null +++ b/collatex-pythonport/use_cases/json-test1.json @@ -0,0 +1,29 @@ +{"witnesses" : + [ + {"id" : "A","tokens" : + [ + {"t" : "The"}, + {"t" : "quick"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "dog."} + ] + }, + + {"id" : "B", "tokens" : + [ + {"t" : "The"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "lazy"}, + {"t" : "dog."} + ] + } + ] +} diff --git a/collatex-pythonport/use_cases/json-test2.json b/collatex-pythonport/use_cases/json-test2.json new file mode 100644 index 000000000..ca26f69af --- /dev/null +++ b/collatex-pythonport/use_cases/json-test2.json @@ -0,0 +1,31 @@ +{"witnesses" : + [ + {"id" : "C","tokens" : + [ + {"t" : "The"}, + {"t" : "quick"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "dog"}, + {"t" : "."} + ] + }, + + {"id" : "D", "tokens" : + [ + {"t" : "The"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "lazy"}, + {"t" : "dog"}, + {"t" : "."} + ] + } + ] +} diff --git a/collatex-pythonport/use_cases/json-test3.json b/collatex-pythonport/use_cases/json-test3.json new file mode 100644 index 000000000..dcbee333e --- /dev/null +++ b/collatex-pythonport/use_cases/json-test3.json @@ -0,0 +1,29 @@ +{"witnesses" : + [ + {"id" : "E","tokens" : + [ + {"t" : "The", "id": 1, "n": "the"}, + {"t" : "quick", "id": 2}, + {"t" : "brown", "id": 3}, + {"t" : "fox", "id": 4}, + {"t" : "jumps", "id": 5}, + {"t" : "over", "id": 6}, + {"t" : "the", "id": 7}, + {"t" : "dog.", "id": 8, "n": "dog"} + ] + }, + + {"id" : "F", "tokens" : + [ + {"t" : "The"}, + {"t" : "brown"}, + {"t" : "fox"}, + {"t" : "jumps"}, + {"t" : "over"}, + {"t" : "the"}, + {"t" : "lazy"}, + {"t" : "dog."} + ] + } + ] +}