Updated project files and tests #19

Open
wants to merge 34 commits into master
Commits
1413301
Update core_functions.py
enury Feb 9, 2015
6842ca8
Create json-test1
enury Feb 9, 2015
5d12d08
Create json-test2.json
enury Feb 9, 2015
aad18be
Delete json-test1
enury Feb 9, 2015
bebc856
Create json-test1.json
enury Feb 9, 2015
f68f4cc
Update json-test1.json
enury Feb 9, 2015
b0fa0b4
Update json-test2.json
enury Feb 9, 2015
ed8c8e0
Update core_functions.py
enury Feb 10, 2015
babd89c
Update core_functions.py
enury Feb 10, 2015
5b5ab04
Update core_functions.py
enury Feb 10, 2015
8bc6bec
Update Witness class in core_classes.py
enury Feb 10, 2015
eaa7a88
Update core_functions.py
enury Feb 11, 2015
6672e52
Update __init__.py
enury Feb 11, 2015
070f825
Update core_functions.py
enury Feb 11, 2015
42fb5eb
Update core_functions.py
enury Feb 11, 2015
3392a42
Update suffix_based_scorer.py
enury Feb 11, 2015
58a3a5e
Update core_functions.py
enury Feb 11, 2015
b5e2bad
Update core_classes.py
enury Feb 11, 2015
7fc92eb
Create json-test3.json
enury Feb 11, 2015
b7beb23
Update core_functions.py
enury Feb 11, 2015
a5bc15b
Update core_functions.py
enury Feb 12, 2015
42de8b1
Update core_functions.py
enury Feb 12, 2015
9aeaebb
Create test_token_class.py
enury Mar 24, 2015
13af6dd
Create test_witness_class.py
enury Mar 24, 2015
87675aa
Create test_collation_class.py
enury Mar 24, 2015
8b9b0ec
Create test_collate_outputs.py
enury Mar 24, 2015
42546cf
Update core_classes.py
enury Mar 24, 2015
c2923d5
Update test_collatex_block_witnesses.py
enury Mar 24, 2015
2914b6b
Update test_near_matching_pretokenized.py
enury Mar 24, 2015
977b0fa
Update test_witness_tokens.py
enury Mar 24, 2015
65a18f8
Update test_collatex_block_witnesses.py
enury Mar 24, 2015
c70db1a
Update test_collation_class.py
enury Mar 25, 2015
ecdb764
Port to Python 3:
enury Mar 25, 2015
193351e
Update test_collatex_block_witnesses.py
enury Mar 25, 2015
3 changes: 1 addition & 2 deletions collatex-pythonport/collatex/__init__.py
@@ -7,8 +7,7 @@

from collatex.core_functions import Collation
from collatex.core_functions import collate
from collatex.core_functions import collate_pretokenized_json

__all__ = ["Collation", "collate", "collate_pretokenized_json"]
__all__ = ["Collation", "collate"]


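With collate_pretokenized_json gone from the public API, pretokenized input is meant to go through the single collate() entry point, building the collation with Collation.create_from_dict() (which flags it as pretokenized). A minimal usage sketch of how the reworked API fits together, using made-up witness data:

from collatex import Collation, collate

# Hypothetical pretokenized input: each witness carries its own token dicts.
pretokenized = {
    "witnesses": [
        {"id": "A", "tokens": [{"t": "the"}, {"t": "black"}, {"t": "cat"}]},
        {"id": "B", "tokens": [{"t": "the"}, {"t": "cat"}]},
    ]
}

collation = Collation.create_from_dict(pretokenized)   # sets collation.pretokenized
table = collate(collation, output="table", segmentation=False)  # AlignmentTable of 't' strings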
2 changes: 1 addition & 1 deletion collatex-pythonport/collatex/collatex_suffix.py
@@ -188,7 +188,7 @@ def __init__(self, occurrences, tokens):
def debug(self):
result = []
for occurrence in self.occurrences:
result.append(' '.join(self.tokens[occurrence.token_range.slices().next()]))
result.append(' '.join(self.tokens[next(occurrence.token_range.slices())]))
return result


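The only change here swaps the Python 2-only generator method call for the next() built-in, which also works under Python 3. A standalone sketch of the difference (not CollateX code):

slices = (s for s in ("a", "b", "c"))  # stands in for occurrence.token_range.slices()

# Python 2 only -- the form removed by this commit:
# first = slices.next()

# Portable form used by the new code (Python 2.6+ and Python 3):
first = next(slices)
print(first)  # "a"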
12 changes: 6 additions & 6 deletions collatex-pythonport/collatex/core_classes.py
@@ -13,7 +13,7 @@
import re
from prettytable import PrettyTable
from textwrap import fill
from collatex.exceptions import TokenError
from collatex.exceptions import TokenError, UnsupportedError

class Row(object):

@@ -161,20 +161,20 @@ def __repr__(self):
class Witness(object):

def __init__(self, witnessdata):
if 'id' not in witnessdata:
raise UnsupportedError("No defined id in witnessdata")
self.sigil = witnessdata['id']
self._tokens = []
if 'content' in witnessdata:
self.content = witnessdata['content']
# print("Witness "+sigil+" TOKENIZER IS CALLED!")
tokenizer = WordPunctuationTokenizer()
tokens_as_strings = tokenizer.tokenize(self.content)
tokens_as_strings = tokenizer.tokenize(witnessdata['content'])
for token_string in tokens_as_strings:
self._tokens.append(Token({'t':token_string}))
elif 'tokens' in witnessdata:
for tk in witnessdata['tokens']:
self._tokens.append(Token(tk))
# TODO no idea what this content string is needed for.
self.content = ' '.join([x.token_string for x in self._tokens])
else:
raise UnsupportedError("No defined content/tokens in witness "+self.sigil)

def tokens(self):
return self._tokens
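The reworked constructor now validates its input: a witness dict needs an 'id' plus either a 'content' string or a 'tokens' list, and anything else raises UnsupportedError. A sketch of the three cases with made-up witness data:

from collatex.core_classes import Witness
from collatex.exceptions import UnsupportedError

# Plain-text witness: 'content' is run through WordPunctuationTokenizer.
plain = Witness({"id": "A", "content": "the black cat"})

# Pretokenized witness: the given token dicts are wrapped as Token objects.
pretok = Witness({"id": "B", "tokens": [{"t": "the"}, {"t": "cat"}]})

# Invalid witness: no 'id' (likewise no 'content'/'tokens') now fails fast.
try:
    Witness({"content": "anonymous witness"})
except UnsupportedError as error:
    print(error)  # "No defined id in witnessdata"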
135 changes: 69 additions & 66 deletions collatex-pythonport/collatex/core_functions.py
@@ -27,59 +27,59 @@ def collate(collation, output="table", layout="horizontal", segmentation=True, n
# check which output format is requested: graph or table
if output=="graph":
return graph

# create alignment table
table = AlignmentTable(collation, graph, layout)
if collation.pretokenized and not segmentation:
token_list = [[tk.token_data for tk in witness.tokens()] for witness in collation.witnesses]
# only with segmentation=False
# get_tokenized_at could behave differently if segmentation=True
table = get_tokenized_at(table, token_list, segmentation=segmentation, layout=layout)
# for display purposes, table and html output return only the token 't' string, not the full token_data dict
if output=="table" or output=="html":
for row in table.rows:
row.cells = [cell["t"] for cell in row.cells]

if output == "json":
return export_alignment_table_as_json(table)
return export_alignment_table_as_json(table, layout=layout)
if output == "html":
return display_alignment_table_as_HTML(table)
if output == "table":
return table
else:
raise Exception("Unknown output type: "+output)



#TODO: this only works with a table output at the moment
#TODO: store the tokens on the graph instead
def collate_pretokenized_json(json, output='table', layout='horizontal', **kwargs):
# Takes more or less the same arguments as collate() above, but with some restrictions.
# Only output types 'json' and 'table' are supported.
if output not in ['json', 'table']:
raise UnsupportedError("Output type" + kwargs['output'] + "not supported for pretokenized collation")
if 'segmentation' in kwargs and kwargs['segmentation']:
raise UnsupportedError("Segmented output not supported for pretokenized collation")
kwargs['segmentation'] = False

# For each witness given, make a 'shadow' witness based on the normalization tokens
# that will actually be collated.
tokenized_witnesses = []
collation = Collation()
for witness in json["witnesses"]:
collation.add_witness(witness)
tokenized_witnesses.append(witness["tokens"])
at = collate(collation, output="table", **kwargs)
tokenized_at = AlignmentTable(collation, layout=layout)
for row, tokenized_witness in zip(at.rows, tokenized_witnesses):
new_row = Row(row.header)
def get_tokenized_at(table, token_list, segmentation=False, layout="horizontal"):
tokenized_at = AlignmentTable(Collation(), layout=layout)
for witness_row, witness_tokens in zip(table.rows, token_list):
new_row = Row(witness_row.header)
tokenized_at.rows.append(new_row)
token_counter = 0
for cell in row.cells:
if cell != "-":
new_row.cells.append(tokenized_witness[token_counter])
token_counter+=1
else:
#TODO: should probably be null or None instead, but that would break the rendering at the moment
new_row.cells.append({"t":"-"})
if output=="json":
return export_alignment_table_as_json(tokenized_at)
if output=="table":
# transform JSON objects to "t" form.
for row in tokenized_at.rows:
row.cells = [cell["t"] for cell in row.cells]
return tokenized_at

def export_alignment_table_as_json(table, indent=None, status=False):
counter = 0
for cell in witness_row.cells:
if cell == "-":
# TODO: should probably be null or None instead, but that would break the rendering at the moment (line 41)
new_row.cells.append({"t" : "-"})
# if segmentation=False
else:
new_row.cells.append(witness_tokens[counter])
counter+=1
# else if segmentation=True
##token_list must be a list of Token instead of list of dict (update lines 34, 64)
##line 41 will not be happy in case of table/html output
#string = witness_tokens[counter].token_string
#token_counter = 1
#while string != cell :
# if counter+token_counter-1 < len(witness_tokens)-1:
# #add token_string of the next token until it is equivalent to the string in the cell
# #if we are not at the last token
# string += ' '+witness_tokens[counter+token_counter].token_string
# token_counter += 1
##there is one list level too many in the output
#new_row.cells.append([tk.token_data for tk in witness_tokens[counter:counter+token_counter]])
#counter += token_counter.
return tokenized_at

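In essence, get_tokenized_at rebuilds each row of the string-based alignment table from the witness's own token dicts, keeping a {"t": "-"} placeholder wherever the aligner left a gap. A reduced sketch of that cell-filling loop with hypothetical values:

# One row of the string-based alignment table ("-" marks a gap)
aligned_cells = ["the", "-", "cat"]
# The tokens this witness actually supplied, in order
witness_tokens = [{"t": "the", "n": "the"}, {"t": "cat", "n": "cat"}]

new_cells = []
counter = 0
for cell in aligned_cells:
    if cell == "-":
        new_cells.append({"t": "-"})            # gap placeholder, as in the diff
    else:
        new_cells.append(witness_tokens[counter])
        counter += 1

print(new_cells)
# [{'t': 'the', 'n': 'the'}, {'t': '-'}, {'t': 'cat', 'n': 'cat'}]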
def export_alignment_table_as_json(table, indent=None, status=False, layout="horizontal"):
json_output = {}
json_output["table"]=[]
sigli = []
@@ -92,6 +92,9 @@ def export_alignment_table_as_json(table, indent=None, status=False):
for column in table.columns:
variant_status.append(column.variant)
json_output["status"]=variant_status
if layout=="vertical":
new_table = [[row[i] for row in json_output["table"]] for i in range(len(row.cells))]
json_output["table"] = new_table
return json.dumps(json_output, sort_keys=True, indent=indent)

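The new layout parameter transposes the JSON table when layout="vertical": instead of one inner list per witness, the output holds one inner list per alignment column. A small sketch of the transposition with hypothetical cell values:

# Horizontal layout: one inner list per witness row
table = [["the", "black", "cat"],
         ["the", "-",     "cat"]]

# Vertical layout: transpose so each inner list is one alignment column
vertical = [[row[i] for row in table] for i in range(len(table[0]))]
print(vertical)  # [['the', 'the'], ['black', '-'], ['cat', 'cat']]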
'''
@@ -101,29 +104,40 @@ class Collation(object):

@classmethod
def create_from_dict(cls, data, limit=None):
if "witnesses" not in data:
raise UnsupportedError("Json input not valid")
witnesses = data["witnesses"]
collation = Collation()
for witness in witnesses[:limit]:
# generate collation object from json_data
collation.add_witness(witness)
# determine if data is pretokenized
if 'tokens' in witness:
collation.pretokenized = True
return collation

# json input can be a string or a file
@classmethod
def create_from_json_string(cls, json_string):
data = json.loads(json_string)
collation = cls.create_from_dict(data)
return collation

@classmethod
# json_data can be a string or a file
def create_from_json(cls, json_data):
data = json.load(json_data)
def create_from_json_file(cls, json_path):
with open(json_path, 'r') as json_file:
data = json.load(json_file)
collation = cls.create_from_dict(data)
return collation

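The old create_from_json, which called json.load directly on its argument, is replaced by two explicit constructors: create_from_json_string for JSON already in memory and create_from_json_file, which opens and closes the file itself. A usage sketch with a hypothetical file path:

# From a JSON string already in memory:
collation = Collation.create_from_json_string(
    '{"witnesses": [{"id": "A", "content": "the cat"}]}')

# From a file on disk (hypothetical path; the method handles opening/closing):
collation = Collation.create_from_json_file("tests/json-test1.json")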
def __init__(self):
self.witnesses = []
self.pretokenized = False
self.counter = 0
self.witness_ranges = {}
self.combined_string = ""
self.cached_suffix_array = None
self.combined_tokens =[]

# the tokenization process happens multiple times
# and by different tokenizers. This should be fixed
def add_witness(self, witnessdata):
# clear the suffix array and LCP array cache
self.cached_suffix_array = None
@@ -134,9 +148,11 @@ def add_witness(self, witnessdata):
# the extra one is for the marker token
self.counter += len(witness.tokens()) +2 # $ + number
self.witness_ranges[witness.sigil] = witness_range
if not self.combined_string == "":
self.combined_string += " $"+str(len(self.witnesses)-1)+ " "
self.combined_string += witness.content
if len(self.witnesses) > 1:
self.combined_tokens.append('$')
self.combined_tokens.append(str(len(self.witnesses)-1))
for tk in witness.tokens():
self.combined_tokens.append(tk.token_string)

def add_plain_witness(self, sigil, content):
return self.add_witness({'id':sigil, 'content':content})
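add_witness now accumulates a combined token list instead of a combined string, separating witnesses with a '$' marker token followed by the witness index. A sketch of the expected structure for two short plain witnesses (made-up content):

collation = Collation()
collation.add_plain_witness("A", "the black cat")
collation.add_plain_witness("B", "the cat")

print(collation.combined_tokens)
# Expected: ['the', 'black', 'cat', '$', '1', 'the', 'cat']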
@@ -146,14 +162,11 @@ def add_plain_witness(self, sigil, content):
raise Exception("Witness "+witness_sigil+" is not added to the collation!")
return self.witness_ranges[witness_sigil]

def get_combined_string(self):
return self.combined_string

def get_sa(self):
#NOTE: implemented in a lazy manner, since calculation of the Suffix Array and LCP Array takes time
if not self.cached_suffix_array:
# Unit byte is done to skip tokenization in third party library
self.cached_suffix_array = SuffixArray(self.tokens, unit=UNIT_BYTE)
self.cached_suffix_array = SuffixArray(self.combined_tokens, unit=UNIT_BYTE)
return self.cached_suffix_array

def get_suffix_array(self):
@@ -164,17 +177,7 @@ def get_lcp_array(self):
sa = self.get_sa()
return sa._LCP_values


def to_extended_suffix_array(self):
return ExtendedSuffixArray(self.tokens, self.get_suffix_array(), self.get_lcp_array())

@property
def tokens(self):
#print("COLLATION TOKENIZE IS CALLED!")
#TODO: complete set of witnesses is retokenized here!
tokenizer = WordPunctuationTokenizer()
tokens = tokenizer.tokenize(self.get_combined_string())
return tokens

return ExtendedSuffixArray(self.combined_tokens, self.get_suffix_array(), self.get_lcp_array())


2 changes: 1 addition & 1 deletion collatex-pythonport/collatex/suffix_based_scorer.py
@@ -147,7 +147,7 @@ def _get_block_witness(self, witness):
occurrences.append(occurrence)
# sort occurrences on position
sorted_o = sorted(occurrences, key=attrgetter('lower_end'))
block_witness = BlockWitness(sorted_o, self.collation.tokens)
block_witness = BlockWitness(sorted_o, self.collation.combined_tokens)
return block_witness

'''