-
Notifications
You must be signed in to change notification settings - Fork 15
Add Boundaries Automatically
The <boundary/>
element in an FML-APML document allows you to specify the ECA’s intonation and nonverbal behaviours. The placement and the attributes of this element depend on parameters such as the punctuation marks of the sentence (for more information, see De Carolis et al., 2004). In order to include <boundary/>
elements in the <speech/>
elements of your FML-APML files, you can use the python script addBoundaries.py. It automatically adds <boundary/>
elements in relation to the punctuation marks of a sentence contained in a <speech/>
element. The table of correspondences between punctuation marks and boundary attributes is given below (see Table of correspondences). The script addBoundaries.py takes as input either a single file or a directory containing several files.
The script addBoundaries.py only processes input files that have an .xml extension and conform to the FML-APML DTD. Although it is strongly recommended, your XML input files are not required to contain a reference to the fml-apml.dtd document. The input files must contain time marker elements <tm/>
at the following locations:
- before the text contained in a <speech/> element
- after the text contained in a <speech/> element
- after each punctuation mark included in the text in a <speech/> element
<boundary/>
elements will be written by the script addBoundaries.py according to these <tm/>
elements.
You can find the python script : addBoundaries.py
#!/usr/bin/env python
"""addBoundaries.py: this script automatically adds boundary elements according to punctuation marks in a fml-apml document type."""
from xml.dom.minidom import parse
from xml.dom.minidom import parseString
import re
import fnmatch
import sys, getopt
import os
from os.path import isfile
from os import walk
import codecs
__author__ = "Sabrina Campano"
def usage():
    """ Print script usage on standard output. """
    # A single parenthesised string prints the same text under Python 2 and Python 3,
    # whereas the former print statement was a syntax error under Python 3.
    print('usage: addBoundaries.py [-r] <inputPath> <outputDir> ')
def parseArgs(argv):
    """ Parse script arguments.

    Sets the module-level global 'replace' to True when the -r /
    --replaceExistingBoundaries option is given, and to False otherwise.

    Keyword arguments:
    argv -- list of command line arguments, without the program name

    Returns the tuple (inputPath, outputDir), exactly as given on the command line.
    Exits with status 2 when the options are invalid, when fewer than two positional
    arguments are given, when input path and output directory designate the same
    location, or when the input path is neither a file nor a directory.
    Exits with the default status after printing usage for -h / --help.
    """
    global replace
    replace = False
    try:
        opts, args = getopt.getopt(argv, "hr", ["help", "replaceExistingBoundaries"])
    except getopt.GetoptError as err:
        print(str(err))
        usage()
        sys.exit(2)
    for opt, _ in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-r", "--replaceExistingBoundaries"):
            replace = True
    if len(args) < 2:
        print('Error: wrong number of arguments')
        usage()
        sys.exit(2)
    inputPath = args[0]
    outputDir = args[1]
    # Compare absolute paths: 'dir', './dir' and 'dir/' all designate the same location,
    # and writing the output there would overwrite the input files.
    if os.path.abspath(inputPath) == os.path.abspath(outputDir):
        print("Error: input path and output directory cannot be the same.")
        sys.exit(2)
    if not os.path.isdir(inputPath) and not os.path.isfile(inputPath):
        print("Error: directory " + inputPath + " not found")
        sys.exit(2)
    # Messages are built as single strings so that the printed text is the same
    # under Python 2 and Python 3.
    print('Input path is:  ' + inputPath)
    print('Output dir is:  ' + outputDir)
    print('Replace existing boundaries:  ' + ("yes" if replace else "no"))
    return inputPath, outputDir
def addBoundary(dom, speechElement, punct, nextTimeMarker):
    """ Add a boundary as the last child node of a speech element issued from dom data. Boundary attributes are set
    according to a punctuation mark, and to the time marker element that follows the punctuation mark.

    The boundary id is "b" followed by the module-level counter 'boundaryCount', which is incremented after each call.
    The boundary starts at "<speech id>:<time marker id>" and ends at the same reference followed by "+0.5".

    Keyword arguments:
    dom -- dom data
    speechElement -- speech element in which the boundary has to be added
    punct -- punctuation mark: ',' gives type LH, '.' gives type LL, anything else gives type HH
    nextTimeMarker -- time marker element that follows the punctuation mark
    """
    global boundaryCount
    # The counter is reset by processFile for each file; start from 0 when this function is
    # called before any file was processed, instead of failing with a NameError.
    if "boundaryCount" not in globals():
        boundaryCount = 0
    # Define boundary type.
    boundaryType = {',': "LH", '.': "LL"}.get(punct, "HH")
    # Get speech element id; fall back to an empty string when the attribute is missing.
    speechId = speechElement.getAttribute("id") or ""
    # Get time marker id.
    nextTimeMarkerId = nextTimeMarker.getAttribute("id")
    # Create boundary node.
    boundaryStart = speechId + ":" + nextTimeMarkerId
    boundaryNode = dom.createElement("boundary")
    boundaryNode.setAttribute("id", "b" + str(boundaryCount))
    boundaryNode.setAttribute("type", boundaryType)
    boundaryNode.setAttribute("start", boundaryStart)
    boundaryNode.setAttribute("end", boundaryStart + "+0.5")
    # Add boundary node as child to speech element.
    speechElement.appendChild(boundaryNode)
    boundaryCount += 1
def main(argv):
    """ Run the script: parse the arguments, then process either the single input file, or every
    matching file found under the input directory.

    Keyword arguments:
    argv -- list of command line arguments, without the program name
    """
    # Regular expression to filter files by extension.
    globs = ['*.xml']
    xmlPattern = re.compile(r'|'.join(fnmatch.translate(glob) for glob in globs))
    inputpath, outputDir = parseArgs(argv)
    singleXmlFile = isfile(inputpath) and xmlPattern.match(inputpath)
    if not singleXmlFile:
        # Walk the input path and mirror its sub-directory layout in the output directory.
        for root, _dirs, files in walk(inputpath):
            subDir = os.path.relpath(root, inputpath)
            for name in files:
                if not xmlPattern.match(name):
                    continue
                source = os.path.join(root, name)  # File to process.
                target = os.path.join(outputDir, subDir, name)  # File output path.
                processFile(source, target)
        return
    # Single file: it is written directly in the output directory, under the same name.
    target = os.path.join(outputDir, os.path.basename(inputpath))
    processFile(inputpath, target)
def writeDom(dom, outputFile):
    """ Write dom data in a file, pretty-printed with tab indentation, without blank lines, encoded in utf-8.

    The directory of the output file is created when it does not exist.

    Keyword arguments:
    dom -- dom data
    outputFile -- path of the file to write
    """
    outputDir = os.path.dirname(outputFile)
    # dirname is empty for a bare file name (file in the current directory):
    # there is nothing to create then, and os.makedirs('') would raise an error.
    if outputDir and not os.path.exists(outputDir):
        os.makedirs(outputDir)
    prettyLines = dom.toprettyxml(indent='\t').split('\n')
    cleanDom = '\n'.join([line for line in prettyLines if line.strip()])  # Remove blank lines.
    # The with statement closes the file even when writing fails.
    with codecs.open(outputFile, "w", encoding='utf-8') as f:
        f.write(cleanDom)
def processFile(inputFile, outputFile):
    """ Add a boundary in the output file when a punctuation mark is found in a speech element of the input file.
    No boundary is added in the speech element when an existing boundary is found and replace option is False.

    For each text node of a speech element, the last punctuation mark among '.', '?', '!' and ',' is used, and a
    boundary is added only when the node that directly follows the text is a <tm/> element.

    Keyword arguments:
    inputFile -- path of the xml file to read
    outputFile -- path of the xml file to write
    """
    # Counter for the number of boundaries in a single file. Used to set boundary ids.
    global boundaryCount
    # Option for the replacement of existing boundaries.
    global replace
    # Messages are joined with spaces into a single string, which gives the same text
    # under Python 2 and Python 3.
    print(" ".join(["\nProcessing file: ", inputFile, "..."]))
    dom = parse(inputFile)
    boundaryCount = 0
    for speech in dom.getElementsByTagName("speech"):
        existingBoundaries = speech.getElementsByTagName("boundary")
        if len(existingBoundaries) > 0:
            if not replace:
                print("Boundaries found in a speech element from input file. No boundary will be added in this speech element.")
                continue
            # Remove existing boundary elements. getElementsByTagName also returns nested
            # descendants, so each element is detached from its own parent.
            for boundaryElement in list(existingBoundaries):
                boundaryElement.parentNode.removeChild(boundaryElement)
            print("Boundaries found in speech element of input file. These boundaries were removed from output file.")
        # Iterate over a snapshot: addBoundary appends new children to the speech element.
        for child in list(speech.childNodes):
            if child.nodeType != child.TEXT_NODE:
                continue
            text = child.nodeValue.strip()
            res = re.findall(r'[.?!,]', text)  # Find punctuation marks in text.
            if not res:
                continue
            punct = res[-1]  # Get last punctuation mark by default.
            nextSibling = child.nextSibling
            # A time marker element must directly follow the text. The node type is checked
            # first because text and comment nodes have no tagName attribute.
            if (nextSibling is not None
                    and nextSibling.nodeType == nextSibling.ELEMENT_NODE
                    and nextSibling.tagName == "tm"):
                addBoundary(dom, speech, punct, nextSibling)
    writeDom(dom, outputFile)
    # Print message with number of boundaries added.
    print(" ".join(["File ", outputFile, " written - ", str(boundaryCount), "boundaries added."]))
if __name__ == "__main__":
    # Script entry point: forward the command line arguments, without the program name.
    scriptArguments = sys.argv[1:]
    main(scriptArguments)
python addBoundaries.py -r myInputPath myOutputDir
Inputs
- myInputPath: a file name, or an input directory containing the files to process (ex: sayHello.xml or FMLDirectory) .
- myOutputDir: output directory, where all the processed files will be written (ex: Results).
- By default, if a <speech/> element in an input file contains a <boundary/> element, the <boundary/> elements of this <speech/> element won’t be replaced. If the -r option is used, then the <boundary/> elements contained in the input files will be replaced in the output files when a punctuation mark is found.
- A directory that contains the processed files. The input files without an .xml extension are not copied in the output directory.
Input file:
<?xml version="1.0" encoding="ISO-8859-1" ?>
<fml-apml>
<bml id="04c7bd0b-f447-4111-889e-cb823c5eadc9" agent="Greta">
<speech id="s1" language="english" start="0.0" voice="marytts">
<tm id="tm1"/>
The REVERIE project is an FP7 european project.
<tm id="tm2"/>
The resulting application of this project is a wide online application where users,
<tm id="tm3"/>
can connect with their friends and share rich experiences.
<tm id="tm4"/>
<pitchaccent end="s1:tm2" id="pa1" importance="1" level="medium" start="s1:tm1" type="HStar" />
</speech>
</bml>
<fml>
<performative id="p1" type="inform" end="s1:tm2" start="s1:tm1" importance="1"/>
<performative id="p2" type="inform" end="s1:tm4" start="s1:tm3" importance="1"/>
</fml>
</fml-apml>
Command:
python addBoundaries.py -r reverie.xml Results
Output file written in the Results directory :
<?xml version="1.0" ?>
<fml-apml>
<bml agent="Greta" id="04c7bd0b-f447-4111-889e-cb823c5eadc9">
<speech id="s1" language="english" start="0.0" voice="marytts">
<tm id="tm1"/>
The REVERIE project is an FP7 european project.
<tm id="tm2"/>
The resulting application of this project is a wide online application where users,
<tm id="tm3"/>
can connect with their friends and share rich experiences.
<tm id="tm4"/>
<pitchaccent end="s1:tm2" id="pa1" importance="1" level="medium" start="s1:tm1" type="HStar"/>
<boundary end="s1:tm2+0.5" id="b0" start="s1:tm2" type="LL"/>
<boundary end="s1:tm3+0.5" id="b1" start="s1:tm3" type="LH"/>
<boundary end="s1:tm4+0.5" id="b2" start="s1:tm4" type="LL"/>
</speech>
</bml>
<fml>
<performative end="s1:tm2" id="p1" importance="1" start="s1:tm1" type="inform"/>
<performative end="s1:tm4" id="p2" importance="1" start="s1:tm3" type="inform"/>
</fml>
</fml-apml>
-
Table of correspondences:

| punctuation mark | boundary attribute |
|---|---|
| , | LH |
| . | LL |
| ! or ? | HH |

Reference: Berardina De Carolis, Catherine Pelachaud, Isabella Poggi, Mark Steedman. APML, a markup language for believable behavior generation. 2004.
Advanced
- Generating New Facial expressions
- Generating New Gestures
- Generating new Hand configurations
- Torso Editor Interface
- Creating an Instance for Interaction
- Create a new virtual character
- Creating a Greta Module in Java
- Modular Application
- Basic Configuration
- Signal
- Feedbacks
- From text to FML
- Expressivity Parameters
- Text-to-speech, TTS
-
AUs from external sources
-
Large language model (LLM)
-
Automatic speech recognition (ASR)
-
Extensions
-
Integration examples
Nothing to show here