From b6495f2cf2d41f744ab29a405dd5629313ef58cb Mon Sep 17 00:00:00 2001
From: Christian <ctb77@stanford.edu>
Date: Thu, 14 May 2020 01:18:15 -0700
Subject: [PATCH] WIP: Modularize FAAValign

Most of the pieces fit together now. The main pipeline works up
until the actual alignment.
---
 align/FAAValign2.py          |   2 +-
 align/cmudictionary.py       | 151 ++++++++++++++++++-----------------
 align/praat.py               |  14 ++--
 align/transcriptprocessor.py |  28 ++++---
 4 files changed, 101 insertions(+), 94 deletions(-)

diff --git a/align/FAAValign2.py b/align/FAAValign2.py
index 2e1ab73..625765a 100644
--- a/align/FAAValign2.py
+++ b/align/FAAValign2.py
@@ -56,7 +56,6 @@ def main(args):
 	aligner = Aligner(*args)
 	aligner.read_transcript()
 	aligner.check_transcript()
-	# works up to here
 	aligner.check_against_dictionary()
 
 	if check:
@@ -65,6 +64,7 @@ def main(args):
 	aligner.check_tempdir('')
 	main_textgrid = praat.TextGrid()
 	duration = aligner.get_duration()
+	# works up to here
 	aligner.align()
 
 
diff --git a/align/cmudictionary.py b/align/cmudictionary.py
index 767d3f9..6febda9 100644
--- a/align/cmudictionary.py
+++ b/align/cmudictionary.py
@@ -33,43 +33,47 @@ def __init__(self, dictionary_file, *args, **kwargs):
         ## check that cmudict has entries
         if len(self.cmu_dict) == 0:
             self.logger.warning('Dictionary %s is empty' % dictionary_file)
-        self.logger.debug("Read dictionary from file %s" % dictionary_file)
+        self.logger.debug("End initialization.")
 
     def __config_flags( self, *args, **kwargs ):
+        self.logger.debug('Reading config flags')
         self.verbose = False
         self.prompt = False
         self.check = False
         try:
             self.verbose = kwargs['verbose']
         except KeyError:
-            pass
+            self.logger.debug('No verbose argument; default to false.')
         try:
             self.prompt = kwargs['prompt']
         except KeyError:
-            pass
+            self.logger.debug('No prompt argument; default to false.')
         try:
             self.check = kwargs['check']
         except KeyError:
-            pass
+            self.logger.debug('No check argument; default to false.')
 
     def read(self,dictionary_file):
         """
         @author Keelan Evanini
         """
+        self.logger.info(f'Reading dictionary from {dictionary_file}')
         cmu_dict = {}
         pat = re.compile('  *')                ## two spaces separating CMU dict entries
-		# CMU dictionary should be converted to a unicode format
+        # CMU dictionary should be converted to a unicode format
         with open(dictionary_file,'r', encoding="latin1") as cmu_dict_file:
             for line in cmu_dict_file.readlines():
                 line = line.rstrip()
                 line = re.sub(pat, ' ', line)      ## reduce all spaces to one
+                self.logger.debug(f'Dictionary line: {line}')
                 word = line.split(' ')[0]          ## orthographic transcription
+                self.logger.debug(f'Word: {str(word)}')
                 phones = line.split(' ')[1:]       ## phonemic transcription
+                self.logger.debug(f'Phones: {str(phones)}')
                 if word not in cmu_dict:
-                    cmu_dict[word] = [phones]       ## phonemic transcriptions represented as list of lists of phones
-                else:
-                    if phones not in cmu_dict[word]:
-                        cmu_dict[word].append(phones)   ## add alternative pronunciation to list of pronunciations
+                    cmu_dict[word] = []
+                if phones not in cmu_dict[word]:
+                    cmu_dict[word].append(phones)   ## add pronunciation to list of pronunciations
         return cmu_dict
 
     def add_dictionary_entries(self, infile, path='.'):
@@ -174,7 +178,30 @@ def check_phone(self, phone, transcription, index):
         else:
             raise ValueError("Unknown phone %s (at position %i) in word %s!\n" % (phone, index+1, transcription))
 
-    def check_word(self,word, next_word='', unknown={}, line=''):
+    def __check_word(self, word, next_word):
+        """A rewrite of check word
+        returns bool
+        """
+
+        if word.upper() in self.cmu_dict:
+            return True
+        self.logger.info(f'Cannot find {word} in dictionary')
+        if self.intended.search(next_word):
+            self.logger.debug(f'Hint given: {next_word}')
+            if next_word in self.cmu_dict:
+                self.logger.info(f'Clue is in dictionary')
+                if self.check:
+                    self.logger.debug(
+                        'Running in check mode, returning false so transcript can be checked')
+                    return False
+                else:
+                    return True
+        else:
+            self.logger.debug('No hint given')
+            return False
+
+
+    def check_word(self,word, next_word='', unknown=None, line=''):
         """checks whether a given word's phonetic transcription is in the CMU dictionary;
         adds the transcription to the dictionary if not"""
         ## INPUT:
@@ -183,95 +210,68 @@ def check_word(self,word, next_word='', unknown={}, line=''):
         ## OUTPUT:
         ## dict unknown = unknown or truncated words (needed if "check transcription" option is selected; remains empty otherwise)
         ## - modifies CMU dictionary (dict cmudict)
-        cmudict = self.cmu_dict
+        if type(unknown) is not dict:
+            unknown = {}
 
-        clue = ''
+        self.logger.info(f'Checking if \'{word}\' in dictionary')
+        if self.__check_word(word, next_word):
+            inDict = True
+        else:
+            inDict = False
 
-        ## dictionary entry for truncated words may exist but not be correct for the current word
-        ## (check first because word will be in CMU dictionary after procedure below)
-        if self.truncated.search(word) and word in cmudict:
-            ## check whether following word is "clue" word?
-            if self.intended.search(next_word):
-                clue = next_word
-            ## do not prompt user for input if "check transcription" option is selected
-            ## add truncated word together with its proposed transcription to list of unknown words
-            ## (and with following "clue" word, if present)
-            if self.check:
-                if clue:
-                    unknown[word] = (cmudict[word], clue.lstrip('+'), line)
-                else:
-                    unknown[word] = (cmudict[word], '', line)
-            ## prompt user for input
-            else:
-                self.logger.warning("Unknown truncation")
-                # Should not prompt user, but behavior should be caught
-                # and handled by user-facing FAAValign2.py
-                """
-                self.logger.debug("Dictionary entry for truncated word %s is %s." % (word, cmudict[word]))
-                if clue:
-                    self.logger.debug("Following word is %s." % next_word)
-                correct = raw_input("Is this correct?  [y/n]")
-                if correct != "y":
-                    transcription = prompt_user(word, clue)
-                    cmudict[word] = [transcription]
-                """
+        cmudict = self.cmu_dict
+        clue = next_word.strip().lstrip('+').upper()
 
-        elif word not in cmudict and word not in self.STYLE_ENTRIES:
-            ## truncated words:
-            if self.truncated.search(word):
-                ## is following word "clue" word?  (starts with "+")
-                if self.intended.search(next_word):
-                    clue = next_word
+        if not inDict and word not in self.STYLE_ENTRIES:
             ## don't do anything if word itself is a clue word
-            elif self.intended.search(word):
+            if '+' in word:
                 return unknown
             ## don't do anything for unclear transcriptions:
-            elif word == '((xxxx))':
+            if word == '((xxxx))':
                 return unknown
             ## uncertain transcription:
-            elif self.start_uncertain.search(word) or self.end_uncertain.search(word):
+            if self.start_uncertain.search(word) or self.end_uncertain.search(word):
                 if self.start_uncertain.search(word) and self.end_uncertain.search(word):
                     word = word.replace('((', '')
                     word = word.replace('))', '')
                     ## check if word is in dictionary without the parentheses
-                    self.check_word(word, '', unknown, line)
-                    return unknown
+                    if not self.__check_word(word,''):
+                        return unknown
                 else:  ## This should not happen!
                     error= "ERROR!  Something is wrong with the transcription of word %s!" % word
                     raise ValueError(error)
             ## asterisked transcriptions:
-            elif word and word[0] == "*":
+            elif word[0] == "*":
                 ## check if word is in dictionary without the asterisk
-                self.check_word(word[1:], '', unknown, line)
-                return unknown
+                if not self.__check_word(word[1:],''):
+                    return unknown
             ## generate new entries for "-in'" words
-            if self.ing.search(word):
-                gword = self.ing.sub("ING", word)
+            if word[-3:].upper() == "IN'":
+                gword = word[:-1].upper()+'G'
                 ## if word has entry/entries for corresponding "-ing" form:
-                if gword in cmudict:
+                if self.__check_word(gword, ''):
                     for t in cmudict[gword]:
                         ## check that transcription entry ends in "- IH0 NG":
-                        if t[-1] == "NG" and t[-2] == "IH0":
-                            tt = t
-                            tt[-1] = "N"
-                            tt[-2] = "AH0"
-                            if word not in cmudict:
-                                cmudict[word] = [tt]
-                            else:
-                                cmudict[word].append(tt)
+                        if t[-2:] == ["IH0", "NG"]:
+                            new_transcription = t[:-2]
+                            new_transcription[-2] = "AH0"
+                            new_transcription[-1] = "N"
+                            if not inDict:
+                                self.cmu_dict[word] = []
+                            if new_transcription not in cmudict[gword]:
+                                self.cmu_dict[word].append(new_transcription)
                     return unknown
             ## if "check transcription" option is selected, add word to list of unknown words
-            if self.check:
-                if clue:
-                    unknown[word] = ("", clue.lstrip('+'), line)
-                else:
-                    unknown[word] = ("", "", line)
-                self.logger.warning("Unknown word %s : %s." % (word.encode('ascii', 'replace'), line.encode('ascii', 'replace')))
-
-            ## otherwise, promput user for Arpabet transcription of missing word
+            if not inDict:
+                self.logger.warning(f"Unknown word '{word}' in line '{line}'")
+                unknown[word] = ("", clue.lstrip('+'), line)
+                return unknown
+        if word in self.STYLE_ENTRIES:
+            self.logger.info(f"Style entry: {word}")
+        elif inDict:
+            self.logger.debug(f"Entry found")
         else:
             self.logger.warning("No transcription for "+word)
-
         return unknown
 
     def merge_dicts(self, d1, d2):
@@ -297,7 +297,8 @@ def write_dict(self, fname, dictionary=None):
             dictionary = self.cmu_dict
         out_string = ''
         ## sort dictionary before writing to file
-        keys = dictionary.keys()
+        keys = list(dictionary)
+        # self.logger.debug(keys)
         keys.sort()
         for word in keys:
             ## make a separate entry for each pronunciation in case of alternative entries
diff --git a/align/praat.py b/align/praat.py
index 3ab1e5f..03960e8 100755
--- a/align/praat.py
+++ b/align/praat.py
@@ -151,7 +151,7 @@ class LPC:
 
     def __init__(self):
         self.logger = logging.getLogger(__name__)
-        self.logger.basicConfig(
+        logging.basicConfig(
             format='%(levelname)s:%(message)s',
             level=logging.DEBUG)
         self.__times = []
@@ -223,7 +223,7 @@ class MFCC:
 
     def __init__(self):
         self.logger = logging.getLogger(__name__)
-        self.logger.basicConfig(
+        logging.basicConfig(
             format='%(levelname)s:%(message)s',
             level=logging.DEBUG)
 
@@ -299,7 +299,7 @@ class TextGrid:
 
     def __init__(self, name=''):
         self.logger = logging.getLogger(__name__)
-        self.logger.basicConfig(
+        logging.basicConfig(
             format='%(levelname)s:%(message)s',
             level=logging.DEBUG)
 
@@ -523,7 +523,7 @@ class IntervalTier:
 
     def __init__(self, name='', xmin=0, xmax=0):
         self.logger = logging.getLogger(__name__)
-        self.logger.basicConfig(
+        logging.basicConfig(
             format='%(levelname)s:%(message)s',
             level=logging.DEBUG)
 
@@ -672,7 +672,7 @@ class PointTier:
 
     def __init__(self, name='', xmin=0, xmax=0):
         self.logger = logging.getLogger(__name__)
-        self.logger.basicConfig(
+        logging.basicConfig(
             format='%(levelname)s:%(message)s',
             level=logging.DEBUG)
 
@@ -744,7 +744,7 @@ class Interval:
 
     def __init__(self, xmin=0, xmax=0, mark=''):
         self.logger = logging.getLogger(__name__)
-        self.logger.basicConfig(
+        logging.basicConfig(
             format='%(levelname)s:%(message)s',
             level=logging.DEBUG)
 
@@ -778,7 +778,7 @@ class Point:
 
     def __init__(self, time, mark):
         self.logger = logging.getLogger(__name__)
-        self.logger.basicConfig(
+        logging.basicConfig(
             format='%(levelname)s:%(message)s',
             level=logging.DEBUG)
 
diff --git a/align/transcriptprocessor.py b/align/transcriptprocessor.py
index c7add0c..001a21b 100644
--- a/align/transcriptprocessor.py
+++ b/align/transcriptprocessor.py
@@ -93,7 +93,7 @@ def check_dictionary_entries(self, wavfile):
         ## OUTPUT:  list newlines = list of list of words for each line (processed)
         ## - prompts user to modify CMU dictionary (cmudict) and writes updated version of CMU dictionary to file
         ## - if "check transcription" option is selected, writes list of unknown words to file and exits
-
+        self.logger.debug('Checking dictionary entries')
         newlines = []
         unknown = {}
 
@@ -151,6 +151,8 @@ def preprocess_transcription(self, line):
         ## INPUT:  string line = line of orthographic transcription
         ## OUTPUT:  list words = list of individual words in transcription
 
+        self.logger.info("Preprocessing transcript line")
+        self.logger.debug(line)
         flag_uncertain = self.flag_uncertain
         last_beg_uncertain = self.last_beg_uncertain
         last_end_uncertain = self.last_end_uncertain
@@ -185,31 +187,35 @@ def preprocess_transcription(self, line):
 
         ## split line into words:
         words = line.split()
+        self.logger.debug(words)
 
         ## add uncertainty parentheses around every word individually
         newwords = []
         for word in words:
+            self.logger.debug(word)
+            self.logger.debug(flag_uncertain)
             if word == "((":        ## beginning of uncertain transcription span
-                if not flag_uncertain:
+                if not self.flag_uncertain:
                     self.flag_uncertain = True
                     self.last_beg_uncertain = original_line
                 else:
                     msg = "Beginning of uncertain transcription span detected twice in a row\n"
-                    msg += ("Please close the the opening double parenthesis in line %s." % last_beg_uncertain)
+                    msg += ("Please close the the opening double parenthesis in line '%s'" % last_beg_uncertain)
                     raise ValueError( msg )
-            elif word == "))":      ## end of uncertain transcription span
-                if flag_uncertain:
+                continue
+            if word == "))":      ## end of uncertain transcription span
+                if self.flag_uncertain:
                     self.flag_uncertain = False
                     self.last_end_uncertain = original_line
                 else:
                     msg = "End of uncertain transcription span detected twice in a row\n"
-                    msg += "No opening double parentheses for line %s." % original_line
+                    msg += "No opening double parentheses for line %s." % last_end_uncertain
                     raise ValueError( msg )
-            else:  ## process words
-                if flag_uncertain:
-                    newwords.append("((" + word + "))")
-                else:
-                    newwords.append(word)
+                continue
+            if self.flag_uncertain:
+                newwords.append("((" + word + "))")
+            else:
+                newwords.append(word)
 
         return newwords