Updated project files and tests #19

Open
wants to merge 34 commits into master
Commits
1413301
Update core_functions.py
enury Feb 9, 2015
6842ca8
Create json-test1
enury Feb 9, 2015
5d12d08
Create json-test2.json
enury Feb 9, 2015
aad18be
Delete json-test1
enury Feb 9, 2015
bebc856
Create json-test1.json
enury Feb 9, 2015
f68f4cc
Update json-test1.json
enury Feb 9, 2015
b0fa0b4
Update json-test2.json
enury Feb 9, 2015
ed8c8e0
Update core_functions.py
enury Feb 10, 2015
babd89c
Update core_functions.py
enury Feb 10, 2015
5b5ab04
Update core_functions.py
enury Feb 10, 2015
8bc6bec
Update Witness class in core_classes.py
enury Feb 10, 2015
eaa7a88
Update core_functions.py
enury Feb 11, 2015
6672e52
Update __init__.py
enury Feb 11, 2015
070f825
Update core_functions.py
enury Feb 11, 2015
42fb5eb
Update core_functions.py
enury Feb 11, 2015
3392a42
Update suffix_based_scorer.py
enury Feb 11, 2015
58a3a5e
Update core_functions.py
enury Feb 11, 2015
b5e2bad
Update core_classes.py
enury Feb 11, 2015
7fc92eb
Create json-test3.json
enury Feb 11, 2015
b7beb23
Update core_functions.py
enury Feb 11, 2015
a5bc15b
Update core_functions.py
enury Feb 12, 2015
42de8b1
Update core_functions.py
enury Feb 12, 2015
9aeaebb
Create test_token_class.py
enury Mar 24, 2015
13af6dd
Create test_witness_class.py
enury Mar 24, 2015
87675aa
Create test_collation_class.py
enury Mar 24, 2015
8b9b0ec
Create test_collate_outputs.py
enury Mar 24, 2015
42546cf
Update core_classes.py
enury Mar 24, 2015
c2923d5
Update test_collatex_block_witnesses.py
enury Mar 24, 2015
2914b6b
Update test_near_matching_pretokenized.py
enury Mar 24, 2015
977b0fa
Update test_witness_tokens.py
enury Mar 24, 2015
65a18f8
Update test_collatex_block_witnesses.py
enury Mar 24, 2015
c70db1a
Update test_collation_class.py
enury Mar 25, 2015
ecdb764
Port to Python 3:
enury Mar 25, 2015
193351e
Update test_collatex_block_witnesses.py
enury Mar 25, 2015
3 changes: 1 addition & 2 deletions collatex-pythonport/collatex/__init__.py
@@ -7,8 +7,7 @@

from collatex.core_functions import Collation
from collatex.core_functions import collate
from collatex.core_functions import collate_pretokenized_json

__all__ = ["Collation", "collate", "collate_pretokenized_json"]
__all__ = ["Collation", "collate"]


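With collate_pretokenized_json gone from the public API, pretokenized input is meant to go through the single collate() entry point, building the collation with Collation.create_from_dict() (which flags it as pretokenized). A minimal usage sketch of how the reworked API fits together, using made-up witness data:

from collatex import Collation, collate

# Hypothetical pretokenized input: each witness carries its own token dicts.
pretokenized = {
    "witnesses": [
        {"id": "A", "tokens": [{"t": "the"}, {"t": "black"}, {"t": "cat"}]},
        {"id": "B", "tokens": [{"t": "the"}, {"t": "cat"}]},
    ]
}

collation = Collation.create_from_dict(pretokenized)   # sets collation.pretokenized
table = collate(collation, output="table", segmentation=False)  # AlignmentTable of 't' strings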
2 changes: 1 addition & 1 deletion collatex-pythonport/collatex/collatex_suffix.py
@@ -188,7 +188,7 @@ def __init__(self, occurrences, tokens):
def debug(self):
result = []
for occurrence in self.occurrences:
result.append(' '.join(self.tokens[occurrence.token_range.slices().next()]))
result.append(' '.join(self.tokens[next(occurrence.token_range.slices())]))
return result


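The only change here swaps the Python 2-only generator method call for the next() built-in, which also works under Python 3. A standalone sketch of the difference (not CollateX code):

slices = (s for s in ("a", "b", "c"))  # stands in for occurrence.token_range.slices()

# Python 2 only -- the form removed by this commit:
# first = slices.next()

# Portable form used by the new code (Python 2.6+ and Python 3):
first = next(slices)
print(first)  # "a"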
12 changes: 6 additions & 6 deletions collatex-pythonport/collatex/core_classes.py
@@ -13,7 +13,7 @@
import re
from prettytable import PrettyTable
from textwrap import fill
from collatex.exceptions import TokenError
from collatex.exceptions import TokenError, UnsupportedError

class Row(object):

@@ -161,20 +161,20 @@ def __repr__(self):
class Witness(object):

def __init__(self, witnessdata):
if 'id' not in witnessdata:
raise UnsupportedError("No defined id in witnessdata")
self.sigil = witnessdata['id']
self._tokens = []
if 'content' in witnessdata:
self.content = witnessdata['content']
# print("Witness "+sigil+" TOKENIZER IS CALLED!")
tokenizer = WordPunctuationTokenizer()
tokens_as_strings = tokenizer.tokenize(self.content)
tokens_as_strings = tokenizer.tokenize(witnessdata['content'])
for token_string in tokens_as_strings:
self._tokens.append(Token({'t':token_string}))
elif 'tokens' in witnessdata:
for tk in witnessdata['tokens']:
self._tokens.append(Token(tk))
# TODO no idea what this content string is needed for.
self.content = ' '.join([x.token_string for x in self._tokens])
else:
raise UnsupportedError("No defined content/tokens in witness "+self.sigil)

def tokens(self):
return self._tokens
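The reworked constructor now validates its input: a witness dict needs an 'id' plus either a 'content' string or a 'tokens' list, and anything else raises UnsupportedError. A sketch of the three cases with made-up witness data:

from collatex.core_classes import Witness
from collatex.exceptions import UnsupportedError

# Plain-text witness: 'content' is run through WordPunctuationTokenizer.
plain = Witness({"id": "A", "content": "the black cat"})

# Pretokenized witness: the given token dicts are wrapped as Token objects.
pretok = Witness({"id": "B", "tokens": [{"t": "the"}, {"t": "cat"}]})

# Invalid witness: no 'id' (likewise no 'content'/'tokens') now fails fast.
try:
    Witness({"content": "anonymous witness"})
except UnsupportedError as error:
    print(error)  # "No defined id in witnessdata"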
135 changes: 69 additions & 66 deletions collatex-pythonport/collatex/core_functions.py
@@ -27,59 +27,59 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
# check which output format is requested: graph or table
if output=="graph":
return graph

# create alignment table
table = AlignmentTable(collation, graph, layout)
if collation.pretokenized and not segmentation:
token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses]
# only with segmentation=False
# get_tokenized_at could behave differently if segmentation=True
table = get_tokenized_at(table, token_list, segmentation=segmentation, layout=layout)
# for display purposes, table and html output return only the token 't' string, not the full token_data dict
if output=="table" or output=="html":
for row in table.rows:
row.cells = [cell["t"] for cell in row.cells]

if output == "json":
return export_alignment_table_as_json(table)
return export_alignment_table_as_json(table, layout=layout)
if output == "html":
return display_alignment_table_as_HTML(table)
if output == "table":
return table
else:
raise Exception("Unknown output type: "+output)



#TODO: this only works with a table output at the moment
#TODO: store the tokens on the graph instead
def collate_pretokenized_json(json, output='table', layout='horizontal', **kwargs):
# Takes more or less the same arguments as collate() above, but with some restrictions.
# Only output types 'json' and 'table' are supported.
if output not in ['json', 'table']:
raise UnsupportedError("Output type" + kwargs['output'] + "not supported for pretokenized collation")
if 'segmentation' in kwargs and kwargs['segmentation']:
raise UnsupportedError("Segmented output not supported for pretokenized collation")
kwargs['segmentation'] = False

# For each witness given, make a 'shadow' witness based on the normalization tokens
# that will actually be collated.
tokenized_witnesses = []
collation = Collation()
for witness in json["witnesses"]:
collation.add_witness(witness)
tokenized_witnesses.append(witness["tokens"])
at = collate(collation, output="table", **kwargs)
tokenized_at = AlignmentTable(collation, layout=layout)
for row, tokenized_witness in zip(at.rows, tokenized_witnesses):
new_row = Row(row.header)
def get_tokenized_at(table, token_list, segmentation=False, layout="horizontal"):
tokenized_at = AlignmentTable(Collation(), layout=layout)
for witness_row, witness_tokens in zip(table.rows, token_list):
new_row = Row(witness_row.header)
tokenized_at.rows.append(new_row)
token_counter = 0
for cell in row.cells:
if cell != "-":
new_row.cells.append(tokenized_witness[token_counter])
token_counter+=1
else:
#TODO: should probably be null or None instead, but that would break the rendering at the moment
new_row.cells.append({"t":"-"})
if output=="json":
return export_alignment_table_as_json(tokenized_at)
if output=="table":
# transform JSON objects to "t" form.
for row in tokenized_at.rows:
row.cells = [cell["t"] for cell in row.cells]
return tokenized_at

def export_alignment_table_as_json(table, indent=None, status=False):
counter = 0
for cell in witness_row.cells:
if cell == "-":
# TODO: should probably be null or None instead, but that would break the rendering at the moment (line 41)
new_row.cells.append({"t" : "-"})
# if segmentation=False
else:
new_row.cells.append(witness_tokens[counter])
counter+=1
# else if segmentation=True
##token_list must be a list of Token instead of list of dict (update lines 34, 64)
##line 41 will not be happy in case of table/html output
#string = witness_tokens[counter].token_string
#token_counter = 1
#while string != cell :
# if counter+token_counter-1 < len(witness_tokens)-1:
# #add token_string of the next token until it is equivalent to the string in the cell
# #if we are not at the last token
# string += ' '+witness_tokens[counter+token_counter].token_string
# token_counter += 1
##there is one list level too many in the output
#new_row.cells.append([tk.token_data for tk in witness_tokens[counter:counter+token_counter]])
#counter += token_counter.
return tokenized_at

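In essence, get_tokenized_at rebuilds each row of the string-based alignment table from the witness's own token dicts, keeping a {"t": "-"} placeholder wherever the aligner left a gap. A reduced sketch of that cell-filling loop with hypothetical values:

# One row of the string-based alignment table ("-" marks a gap)
aligned_cells = ["the", "-", "cat"]
# The tokens this witness actually supplied, in order
witness_tokens = [{"t": "the", "n": "the"}, {"t": "cat", "n": "cat"}]

new_cells = []
counter = 0
for cell in aligned_cells:
    if cell == "-":
        new_cells.append({"t": "-"})            # gap placeholder, as in the diff
    else:
        new_cells.append(witness_tokens[counter])
        counter += 1

print(new_cells)
# [{'t': 'the', 'n': 'the'}, {'t': '-'}, {'t': 'cat', 'n': 'cat'}]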
def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"):
json_output = {}
json_output["table"]=[]
sigli = []
@@ -92,6 +92,9 @@ def export_alignment_table_as_json(table, indent=None, status=False):
for column in table.columns:
variant_status.append(column.variant)
json_output["status"]=variant_status
if layout=="vertical":
new_table = [[row[i] for row in json_output["table"]] for i in range(len(row.cells))]
json_output["table"] = new_table
return json.dumps(json_output, sort_keys=True, indent=indent)

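The new layout parameter transposes the JSON table when layout="vertical": instead of one inner list per witness, the output holds one inner list per alignment column. A small sketch of the transposition with hypothetical cell values:

# Horizontal layout: one inner list per witness row
table = [["the", "black", "cat"],
         ["the", "-",     "cat"]]

# Vertical layout: transpose so each inner list is one alignment column
vertical = [[row[i] for row in table] for i in range(len(table[0]))]
print(vertical)  # [['the', 'the'], ['black', '-'], ['cat', 'cat']]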
'''
@@ -101,29 +104,40 @@ class Collation(object):

@classmethod
def create_from_dict(cls, data, limit=None):
if "witnesses" not in data:
raise UnsupportedError("Json input not valid")
witnesses = data["witnesses"]
collation = Collation()
for witness in witnesses[:limit]:
# generate collation object from json_data
collation.add_witness(witness)
# determine if data is pretokenized
if 'tokens' in witness:
collation.pretokenized = True
return collation

# json input can be a string or a file
@classmethod
def create_from_json_string(cls, json_string):
data = json.loads(json_string)
collation = cls.create_from_dict(data)
return collation

@classmethod
# json_data can be a string or a file
def create_from_json(cls, json_data):
data = json.load(json_data)
def create_from_json_file(cls, json_path):
with open(json_path, 'r') as json_file:
data = json.load(json_file)
collation = cls.create_from_dict(data)
return collation

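The old create_from_json, which called json.load directly on its argument, is replaced by two explicit constructors: create_from_json_string for JSON already in memory and create_from_json_file, which opens and closes the file itself. A usage sketch with a hypothetical file path:

# From a JSON string already in memory:
collation = Collation.create_from_json_string(
    '{"witnesses": [{"id": "A", "content": "the cat"}]}')

# From a file on disk (hypothetical path; the method handles opening/closing):
collation = Collation.create_from_json_file("tests/json-test1.json")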
def __init__(self):
self.witnesses = []
self.pretokenized = False
self.counter = 0
self.witness_ranges = {}
self.combined_string = ""
self.cached_suffix_array = None
self.combined_tokens =[]

# the tokenization process happens multiple times
# and by different tokenizers. This should be fixed
def add_witness(self, witnessdata):
# clear the suffix array and LCP array cache
self.cached_suffix_array = None
@@ -134,9 +148,11 @@ def add_witness(self, witnessdata):
# the extra one is for the marker token
self.counter += len(witness.tokens()) +2 # $ + number
self.witness_ranges[witness.sigil] = witness_range
if not self.combined_string == "":
self.combined_string += " $"+str(len(self.witnesses)-1)+ " "
self.combined_string += witness.content
if len(self.witnesses) > 1:
self.combined_tokens.append('$')
self.combined_tokens.append(str(len(self.witnesses)-1))
for tk in witness.tokens():
self.combined_tokens.append(tk.token_string)

def add_plain_witness(self, sigil, content):
return self.add_witness({'id':sigil, 'content':content})
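add_witness now accumulates a combined token list instead of a combined string, separating witnesses with a '$' marker token followed by the witness index. A sketch of the expected structure for two short plain witnesses (made-up content):

collation = Collation()
collation.add_plain_witness("A", "the black cat")
collation.add_plain_witness("B", "the cat")

print(collation.combined_tokens)
# Expected: ['the', 'black', 'cat', '$', '1', 'the', 'cat']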
@@ -146,14 +162,11 @@ def add_plain_witness(self, sigil, content):
raise Exception("Witness "+witness_sigil+" is not added to the collation!")
return self.witness_ranges[witness_sigil]

def get_combined_string(self):
return self.combined_string

def get_sa(self):
#NOTE: implemented in a lazy manner, since calculation of the Suffix Array and LCP Array takes time
if not self.cached_suffix_array:
# Unit byte is done to skip tokenization in third party library
self.cached_suffix_array = SuffixArray(self.tokens, unit=UNIT_BYTE)
self.cached_suffix_array = SuffixArray(self.combined_tokens, unit=UNIT_BYTE)
return self.cached_suffix_array

def get_suffix_array(self):
@@ -164,17 +177,7 @@ def get_lcp_array(self):
sa = self.get_sa()
return sa._LCP_values


def to_extended_suffix_array(self):
return ExtendedSuffixArray(self.tokens, self.get_suffix_array(), self.get_lcp_array())

@property
def tokens(self):
#print("COLLATION TOKENIZE IS CALLED!")
#TODO: complete set of witnesses is retokenized here!
tokenizer = WordPunctuationTokenizer()
tokens = tokenizer.tokenize(self.get_combined_string())
return tokens

return ExtendedSuffixArray(self.combined_tokens, self.get_suffix_array(), self.get_lcp_array())


2 changes: 1 addition & 1 deletion collatex-pythonport/collatex/suffix_based_scorer.py
@@ -147,7 +147,7 @@ def _get_block_witness(self, witness):
occurrences.append(occurrence)
# sort occurrences on position
sorted_o = sorted(occurrences, key=attrgetter('lower_end'))
block_witness = BlockWitness(sorted_o, self.collation.tokens)
block_witness = BlockWitness(sorted_o, self.collation.combined_tokens)
return block_witness

'''