NOTE(review): this patch was re-flowed from a flattened copy in which leading
whitespace was lost. Indentation on context and removed lines is inferred from
the Python structure and must be confirmed against the target files before
applying. The index hashes are the original ones and do not reflect the
revised added lines.

diff --git a/corpustools/corpus/io/csv.py b/corpustools/corpus/io/csv.py
index 34b7835e..125fe4a5 100644
--- a/corpustools/corpus/io/csv.py
+++ b/corpustools/corpus/io/csv.py
@@ -87,6 +87,112 @@ def inspect_csv(path, num_lines = 10, coldelim = None, transdelim = None):
 
     return atts, best
 
+def check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types=None, feature_system_path=None,
+                               stop_check=None, call_back=None):
+    """
+    Collect the segments in a column-delimited corpus file that are
+    missing from the feature system
+
+    Only columns whose annotation type is a tier are scanned.  Ignored
+    characters and stress/tone symbols are stripped before each
+    transcription is split into segments.
+
+    Parameters
+    ----------
+    corpus_name : str
+        Not used here; kept so the signature parallels load_corpus_csv
+    path : str
+        Full path to the text file
+    delimiter : str
+        Character used to separate columns
+    annotation_types : list, optional
+        Annotation types for the columns; found with inspect_csv if None
+    feature_system_path : str, optional
+        Full path to the saved feature matrix to check against
+    stop_check : callable, optional
+        Not used here
+    call_back : callable, optional
+        Not used here
+
+    Returns
+    -------
+    set
+        Segments that are not in the feature matrix; empty when
+        no feature matrix file is available
+
+    Raises
+    ------
+    DelimiterError
+        If the header line cannot be split on the delimiter
+    """
+    if feature_system_path is None or not os.path.exists(feature_system_path):
+        # No feature matrix to compare against, so nothing can be missing
+        return set()
+
+    feature_matrix = load_binary(feature_system_path)
+    feature_matrix = modernize.modernize_specifier(feature_matrix)
+
+    if annotation_types is None:
+        annotation_types, delimiter = inspect_csv(path, coldelim=delimiter)
+
+    for a in annotation_types:
+        a.reset()
+
+    missing = set()
+
+    with open(path, encoding='utf-8-sig') as f:
+        headers = f.readline()
+        headers = headers.split(delimiter)
+        if len(headers) == 1:
+            e = DelimiterError(('Could not parse the corpus.\nCheck that the column delimiter you typed in matches '
+                                'the one used in the file.'))
+            raise e
+        # Columns are interpreted through the annotation types, not the raw header text
+        headers = annotation_types
+
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+
+            for k, v in zip(headers, line.split(delimiter)):
+                v = v.strip()
+                if k.attribute.att_type == 'tier':
+                    ignored = k.ignored_characters
+                    if ignored is not None:
+                        v = ''.join(x for x in v if x not in ignored)
+
+                    sd = k.syllable_delimiter
+                    if sd is not None:
+                        syllables = v.split(sd)
+                    else:
+                        syllables = [v]
+
+                    td = k.trans_delimiter
+                    stress_spec = set(k.stress_specification.keys())
+                    tone_spec = set(k.tone_specification.keys())
+                    supra_spec = stress_spec.union(tone_spec)
+                    for syllable in syllables:
+                        syllable = ''.join(x for x in syllable if x not in supra_spec)
+
+                        if td is None:
+                            if k.digraph_pattern is not None:
+                                string = k.digraph_pattern.findall(syllable)
+                            else:
+                                string = list(syllable)
+                        else:
+                            string = syllable.split(td)
+
+                        for seg in string:
+                            if seg == '':
+                                continue
+
+                            if seg not in feature_matrix.segments:
+                                missing.add(seg)
+
+    return missing
+
+
 def load_corpus_csv(corpus_name, path, delimiter,
                     annotation_types = None,
                     feature_system_path = None,
@@ -117,6 +223,9 @@ def load_corpus_csv(corpus_name, path, delimiter,
         Corpus object generated from the text file
 
     """
+    check_feature_coverage_csv(corpus_name, path, delimiter, annotation_types, feature_system_path,
+                               stop_check, call_back)
+
     corpus = Corpus(corpus_name)
     if feature_system_path is not None and os.path.exists(feature_system_path):
         feature_matrix = load_binary(feature_system_path)
@@ -171,7 +280,6 @@ def load_corpus_csv(corpus_name, path, delimiter,
                     d[k.attribute.name] = (k.attribute, v)
 
             word = Word(**d)
-            # TODO: what is the following code doing?
             if word.transcription:
                 #transcriptions can have phonetic symbol delimiters
                 if not word.spelling:
diff --git a/corpustools/corpus/io/helper.py b/corpustools/corpus/io/helper.py
index 52c6dfca..b5cc0784 100644
--- a/corpustools/corpus/io/helper.py
+++ b/corpustools/corpus/io/helper.py
@@ -71,22 +71,22 @@ def __init__(self, syllable, feature_matrix, annotation_type, begin=None, end=No
                     self.tone = symbol
                 else:
                     seg += symbol
-                
-                for i, j in enumerate(feature_matrix.features):
-                    if j == 'syllabic':
-                        index_for_syllabic = i + 1
-                try:
-                    if feature_matrix.seg_to_feat_line(seg)[index_for_syllabic] == "-":  # not syllabic
-                        if is_nucleus:
-                            self.coda.append(BaseAnnotation(seg))
-                        else:
-                            self.onset.append(BaseAnnotation(seg))
-                    else:  # syllabic
-                        is_nucleus = True
-                        self.nucleus.append(BaseAnnotation(seg))
-                except KeyError as e:
-                    e = MissingFeatureError('The feature values for {} is not specified.'.format(seg))
-                    raise e
+
+            for i, j in enumerate(feature_matrix.features):
+                if j == 'syllabic':
+                    index_for_syllabic = i + 1
+            try:
+                if feature_matrix.seg_to_feat_line(seg)[index_for_syllabic] == "-":  # not syllabic
+                    if is_nucleus:
+                        self.coda.append(BaseAnnotation(seg))
+                    else:
+                        self.onset.append(BaseAnnotation(seg))
+                else:  # syllabic
+                    is_nucleus = True
+                    self.nucleus.append(BaseAnnotation(seg))
+            except KeyError as e:
+                e = MissingFeatureError('The feature values for {} is not specified.'.format(seg))
+                raise e
 
     def __iter__(self):
         segs = list()
@@ -502,8 +502,9 @@ def parse_transcription(string, annotation_type, feature_matrix=None, corpus=Non
 
     #final_string = []
 
-    corpus.inventory.stress_types = annotation_type.stress_specification
-    corpus.inventory.tone_types = annotation_type.tone_specification
+    if corpus is not None:
+        corpus.inventory.stress_types = annotation_type.stress_specification
+        corpus.inventory.tone_types = annotation_type.tone_specification
 
     #sd = annotation_type.syllable_delimiter
 
diff --git a/setup.py b/setup.py
index f5fc4b97..88250b2d 100644
--- a/setup.py
+++ b/setup.py
@@ -56,6 +56,7 @@ def run_tests(self):
           'numpy',
           'scipy',
-          'textgrid'
+          'textgrid',
+          'pyqt'
           #'python-acoustic-similarity'
       ],
       entry_points = {