Merge pull request #196 from Forced-Alignment-and-Vowel-Extraction/co…

…okbook improved phrase creation recipe
Forced-Alignment-and-Vowel-Extraction · Jun 24, 2024 · b0281d2 · b0281d2
2 parents fa395d3 + 8e8f247
commit b0281d2
Show file tree

Hide file tree

Showing 2 changed files with 106 additions and 39 deletions.
diff --git a/docs/cookbook/overlaps.qmd b/docs/cookbook/overlaps.qmd
@@ -20,64 +20,86 @@ atg = AlignedTextGrid(
 
 ## Overlap Detection
 
-We'll get all phones that aren't silences.
+We'll get all phones that aren't silences and create a SequenceList from them.
+A SequenceList has convenience attributes that return arrays of the start
+and end times of the SequenceIntervals within it.
 
 ```{python}
-all_phones = [
-    phone
-    for group in atg
-    for phone in group.Phone
-    if phone.label != ""
-]
+all_phones = SequenceList(
+    *[
+        phone
+        for group in atg
+        for phone in group.Phone
+        if phone.label != ""
+    ]
+)
 ```
 
+Now, we'll loop through these phones of interest. First we'll
+
+- Set every phone's "overlapped" feature to False. This will remain False if it is not overlapped.
+- Check for any overlaps with the formula (x_start < y_end) & (y_start < x_end). 
+- This will be true exactly once for every phone (when it is compared to itself), so if it is true more than once, the interval is overlapped.
+- We'll also set an "overlapper" feature which is a list of the intervals that are doing the overlapping.
 
-```{python}
-starts = np.array([
-    phone.start
-    for phone in all_phones
-])
-
-ends = np.array([
-    phone.end
-    for phone in all_phones
-])
-```
 
 ```{python}
 for phone in all_phones:
+
     self_index = all_phones.index(phone)
     phone.set_feature("overlapped", False)
 
-    overlap = (phone.start < ends) & (starts < phone.end)
-    overlappers = np.argwhere(overlap).squeeze().tolist()
+    overlap = (phone.start < all_phones.ends) & \
+              (all_phones.starts < phone.end)
+
+    overlappers = np.argwhere(overlap) \
+                    .squeeze() \
+                    .tolist()
     
 
     if overlap.sum() > 1:
         overlappers.remove(self_index)
-        overlapper_list = [all_phones[idx] for idx in overlappers]
+        overlapper_list = [
+            all_phones[idx] 
+            for idx in overlappers
+        ]
 
         phone.set_feature("overlapped", True)
-        phone.set_feature("overlapper", SequenceList(*overlapper_list))
+        phone.set_feature(
+            "overlapper", 
+            SequenceList(*overlapper_list)
+        )
 ```
 
+Let's grab one of the overlapped phones.
+
 ```{python}
-#| echo: false
-overlapped_phones = [phone for phone in all_phones if phone.overlapped]
+overlapped_phones = [
+    phone 
+    for phone in all_phones 
+    if phone.overlapped
+]
+
+one_phone = overlapped_phones[0]
+```
+
+We can inspect its timing and compare it to the overlappers.
 
+```{python}
+#| code-fold: true
 print(
-    f"Overlapped: {(overlapped_phones[0].start, overlapped_phones[0].end, overlapped_phones[0].label)}"
+    f"Overlapped: {(one_phone.start, one_phone.end, one_phone.label)}"
 )
 
 print(
-    f"Overlapped word: {overlapped_phones[0].within.label}"
+    f"Overlapped word: {one_phone.within.label}"
 )
 
 print(
-    f"Overlappers: {(overlapped_phones[0].overlapper.starts, overlapped_phones[0].overlapper.ends, overlapped_phones[0].overlapper.labels)}"
+    f"Overlapper: {one_phone.overlapper.starts, one_phone.overlapper.ends, one_phone.overlapper.labels}"
 )
 
 print(
-    f"Overlapper words: {[x.within.label for x in overlapped_phones[0].overlapper]}"
+    f"Overlapper words: {[x.within.label for x in one_phone.overlapper]}"
 )
 ```
diff --git a/docs/cookbook/phrase_creation.qmd b/docs/cookbook/phrase_creation.qmd
@@ -48,32 +48,77 @@ def make_phrase_label(a_label, b_label):
 
 The `fuse_rightwards()` method will fuse the following interval to the current interval and pop the following interval from the tier. Therefore, we don't want to use a `for`-loop.
 
-Instead, we'll use a `while` loop, which will end when we reach the end of the `Phrase` (when `fol.label == #`). We'll update the interval we are fusing with when
+Instead, we'll use a `while` loop, which will end when we reach the end of the `Phrase` tier. We'll update the interval we are fusing with when
 
 - The current interval's label is "" (i.e. it is a pause)
 - The following interval's label is "" and its duration is at least 220 ms.
 
 The `continue` keyword under the `if` statements bumps us back to the top of the `while` loop, which will check to see if we're at the end of the Phrase tier.
 
 ```{python}
-this_interval = atg[0].Phrase.first
+this_interval = atg[0].Phrase.first    # <1>
 
-while this_interval.fol.label != "#":
+while this_interval is not atg[0].Phrase.last: # <2>
 
-    if this_interval.label == "":
-        this_interval = this_interval.fol
-        continue
+    if this_interval.label == "":           # <3>
+        this_interval = this_interval.fol   # <3>
+        continue                            # <3>
     
-    if this_interval.fol.label == "" and this_interval.fol.duration > 0.220:
-        this_interval = this_interval.fol
-        continue
-
-    this_interval.fuse_rightwards(label_fun = make_phrase_label)
+    following_long_pause = (                # <4>
+        this_interval.fol.label == ""       # <4>
+        and                                 # <4>
+        this_interval.fol.duration >= 0.220 # <4>
+    )                                       # <4>
+
+    if following_long_pause:                # <5> 
+        this_interval = this_interval.fol   # <5>
+        continue                            # <5>
+
+    this_interval.fuse_rightwards(          # <6>
+        label_fun = make_phrase_label       # <6>
+    )                                       # <6>
 ```
+1. Manually begin at the first interval.
+2. The value of `.last` is dynamically updated, so this is safe.
+3. If we are *currently* in a pause interval, move to the next interval.
+4. Get a `True` or `False` if the next interval is a pause equal to or greater than 220ms.
+5. If the following interval is a long pause, update `this_interval` to be the following interval. The previous `if` statement will keep bumping us along until we get to a non-pause interval. 
+6. If neither of the previous `if` statements was triggered, we fuse `this_interval` with the following interval.
 
 We can check on the results.
 
 ```{python}
 for phrase in atg[0].Phrase[0:10]:
     print(phrase.label)
+```
+
+And just for clarity, each non-pause word is now a subset member of a phrase interval.
+
+```{python}
+(
+    atg[0].Word[1].label,
+    atg[0].Word[1].within.label
+)
+```
+
+## More ideas
+
+We can also, for example, get a list of the duration of pauses that occur *within* a phrase.
+
+```{python}
+import numpy as np
+
+in_phrase_pauses = [
+    interval
+    for interval in atg[0].Word
+    if interval.label == ""
+    if interval.within.label != ""
+]
+
+pause_durs = np.array([
+    interval.duration
+    for interval in in_phrase_pauses
+])
+
+pause_durs
 ```