[mimictts] Initial contribution (openhab#13045)

* [mimictts] Initial contribution Add a voice service for Mimic (version 3) TTS from Mycroft AI. Signed-off-by: Gwendal Roulleau <[email protected]>
nemerdaud · Feb 28, 2023 · cd0b758 · cd0b758
1 parent d8b1368
commit cd0b758
Show file tree

Hide file tree

Showing 14 changed files with 558 additions and 0 deletions.
diff --git a/CODEOWNERS b/CODEOWNERS
@@ -391,6 +391,7 @@
 /bundles/org.openhab.voice.googletts/ @gbicskei
 /bundles/org.openhab.voice.mactts/ @kaikreuzer
 /bundles/org.openhab.voice.marytts/ @kaikreuzer
+/bundles/org.openhab.voice.mimictts/ @dalgwen
 /bundles/org.openhab.voice.actiontemplatehli/ @GiviMAD
 /bundles/org.openhab.voice.picotts/ @FlorianSW
 /bundles/org.openhab.voice.pollytts/ @hillmanr

diff --git a/bom/openhab-addons/pom.xml b/bom/openhab-addons/pom.xml
@@ -1956,6 +1956,11 @@
       <artifactId>org.openhab.voice.marytts</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.openhab.addons.bundles</groupId>
+      <artifactId>org.openhab.voice.mimictts</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.openhab.addons.bundles</groupId>
       <artifactId>org.openhab.voice.actiontemplatehli</artifactId>

diff --git a/bundles/org.openhab.voice.mimictts/NOTICE b/bundles/org.openhab.voice.mimictts/NOTICE
@@ -0,0 +1,13 @@
+This content is produced and maintained by the openHAB project.
+
+* Project home: https://www.openhab.org
+
+== Declared Project Licenses
+
+This program and the accompanying materials are made available under the terms
+of the Eclipse Public License 2.0 which is available at
+https://www.eclipse.org/legal/epl-2.0/.
+
+== Source Code
+
+https://github.com/openhab/openhab-addons
diff --git a/bundles/org.openhab.voice.mimictts/README.md b/bundles/org.openhab.voice.mimictts/README.md
@@ -0,0 +1,49 @@
+# Mimic Text-to-Speech
+
+Mimic (version 3 and above) is an offline, open source text-to-speech engine designed by Mycroft A.I. for its eponymous voice assistant. It provides multiple voices, available in different languages and variants.
+
+Its neural network is built upon some very good and some not-so-good models, so try some to be sure you get the best one for your need.
+
+Mimic3 doesn't need Mycroft, and it can be run as a simple command line utility, or as a web server with an API.
+
+This TTS bundle makes use of the latter, so please note: this openHAB TTS bundle is NOT standalone; it requires the Mimic web server to run somewhere (on your openHAB computer, or elsewhere on your network).
+
+You can find more information about the Mimic web server, and how to install it, in the [official documentation](https://mycroft-ai.gitbook.io/docs/mycroft-technologies/mimic-tts/mimic-3#installation).
+
+It supports a subset of SSML, and if you want to use it, be sure to start your text with `<speak>`.
+
+## Configuration
+
+Use your favorite configuration UI to edit **Settings / Other Services - Mimic Text-to-Speech** and set:
+
+* **url** - Mimic URL. Defaults to `http://localhost:59125`.
+* **speakingRate** - Controls how fast the voice speaks the text. A value of 1 is the speed of the training dataset. Less than 1 is faster, and more than 1 is slower.
+* **audioVolatility** - The amount of noise added to the generated audio (0-1). Can help mask audio artifacts from the voice model. Multi-speaker models tend to sound better with a lower amount of noise than single speaker models.
+* **phonemeVolatility** - The amount of noise used to generate phoneme durations (0-1). Allows for variable speaking cadence, with a value closer to 1 being more variable. Multi-speaker models tend to sound better with a lower amount of phoneme variability than single speaker models.
+
+In case you would like to set up the service via a text file, create a new file in `$OPENHAB_ROOT/conf/services` named `mimictts.cfg`.
+
+Its contents should look similar to:
+
+```
+org.openhab.voice.mimictts:url=http://localhost:59125
+org.openhab.voice.mimictts:speakingRate=1
+org.openhab.voice.mimictts:audioVolatility=0.667
+org.openhab.voice.mimictts:phonemeVolatility=0.8
+```
+
+### Default Text-to-Speech and Voice Configuration
+
+You can set up your preferred default Text-to-Speech and default voice in the UI:
+
+* Go to **Settings**.
+* Edit **System Services - Voice**.
+* Set **Mimic** as **Default Text-to-Speech**.
+* Choose your preferred **Default Voice** for your setup.
+
+In case you would like to configure these settings via a text file, you can edit the file `runtime.cfg` in `$OPENHAB_ROOT/conf/services` and set the following entries:
+
+```
+org.openhab.voice:defaultTTS=mimictts
+org.openhab.voice:defaultVoice=mimictts:fr_FR_siwis_low
+```
diff --git a/bundles/org.openhab.voice.mimictts/pom.xml b/bundles/org.openhab.voice.mimictts/pom.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://maven.apache.org/POM/4.0.0"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.openhab.addons.bundles</groupId>
+    <artifactId>org.openhab.addons.reactor.bundles</artifactId>
+    <version>3.4.0-SNAPSHOT</version>
+  </parent>
+
+  <artifactId>org.openhab.voice.mimictts</artifactId>
+
+  <name>openHAB Add-ons :: Bundles :: Voice :: mimic Text-To-Speech</name>
+
+</project>
diff --git a/bundles/org.openhab.voice.mimictts/src/main/feature/feature.xml b/bundles/org.openhab.voice.mimictts/src/main/feature/feature.xml
@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<features name="org.openhab.voice.mimictts-${project.version}" xmlns="http://karaf.apache.org/xmlns/features/v1.4.0">
+	<repository>mvn:org.openhab.core.features.karaf/org.openhab.core.features.karaf.openhab-core/${ohc.version}/xml/features</repository>
+
+	<feature name="openhab-voice-mimictts" description="Mimic Text-to-Speech" version="${project.version}">
+		<feature>openhab-runtime-base</feature>
+		<bundle start-level="80">mvn:org.openhab.addons.bundles/org.openhab.voice.mimictts/${project.version}</bundle>
+	</feature>
+</features>
diff --git a/...hab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java b/...hab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicConfiguration.java
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2010-2022 Contributors to the openHAB project
+ *
+ * See the NOTICE file(s) distributed with this work for additional
+ * information.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ */
+package org.openhab.voice.mimic.internal;
+
+import org.eclipse.jdt.annotation.NonNullByDefault;
+
+/**
+ * Settings holder for the Mimic text-to-speech service. Each field starts out with the value that applies when the
+ * matching configuration parameter is not provided.
+ *
+ * @author Gwendal Roulleau - Initial contribution
+ */
+@NonNullByDefault
+public class MimicConfiguration {
+
+    /** Base URL of the Mimic web server, without any API path. */
+    public String url = "http://localhost:59125";
+
+    /** How fast the voice speaks; 1 is the speed of the training dataset, lower is faster, higher is slower. */
+    public Double speakingRate = 1.0;
+
+    /** Amount of noise (0-1) added to the generated audio. */
+    public Double audioVolatility = 0.667;
+
+    /** Amount of noise (0-1) used when generating phoneme durations. */
+    public Double phonemeVolatility = 0.8;
+}
diff --git a/...penhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java b/...penhab.voice.mimictts/src/main/java/org/openhab/voice/mimic/internal/MimicTTSService.java
@@ -0,0 +1,249 @@
+/**
+ * Copyright (c) 2010-2022 Contributors to the openHAB project
+ *
+ * See the NOTICE file(s) distributed with this work for additional
+ * information.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Public License 2.0 which is available at
+ * http://www.eclipse.org/legal/epl-2.0
+ *
+ * SPDX-License-Identifier: EPL-2.0
+ */
+package org.openhab.voice.mimic.internal;
+
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
+import java.util.HashSet;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.eclipse.jdt.annotation.NonNullByDefault;
+import org.eclipse.jdt.annotation.Nullable;
+import org.openhab.core.audio.AudioFormat;
+import org.openhab.core.audio.AudioStream;
+import org.openhab.core.audio.ByteArrayAudioStream;
+import org.openhab.core.config.core.ConfigurableService;
+import org.openhab.core.io.net.http.HttpRequestBuilder;
+import org.openhab.core.io.net.http.HttpUtil;
+import org.openhab.core.library.types.RawType;
+import org.openhab.core.voice.TTSException;
+import org.openhab.core.voice.TTSService;
+import org.openhab.core.voice.Voice;
+import org.openhab.voice.mimic.internal.dto.VoiceDto;
+import org.osgi.framework.Constants;
+import org.osgi.service.component.annotations.Activate;
+import org.osgi.service.component.annotations.Component;
+import org.osgi.service.component.annotations.Modified;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.google.gson.JsonSyntaxException;
+
+/**
+ * Mimic Voice service implementation.
+ * <p>
+ * The service talks to a Mimic web server over HTTP: the list of voices is read from {@code /api/voices} and speech
+ * is requested as WAV data from {@code /api/tts}.
+ *
+ * @author Gwendal Roulleau - Initial contribution
+ */
+@Component(configurationPid = MimicTTSService.SERVICE_PID, property = Constants.SERVICE_PID + "="
+        + MimicTTSService.SERVICE_PID)
+@ConfigurableService(category = MimicTTSService.SERVICE_CATEGORY, label = MimicTTSService.SERVICE_NAME
+        + " Text-to-Speech", description_uri = MimicTTSService.SERVICE_CATEGORY + ":" + MimicTTSService.SERVICE_ID)
+@NonNullByDefault
+public class MimicTTSService implements TTSService {
+
+    static final String SERVICE_CATEGORY = "voice";
+    static final String SERVICE_ID = "mimictts";
+    static final String SERVICE_PID = "org.openhab." + SERVICE_CATEGORY + "." + SERVICE_ID;
+    static final String SERVICE_NAME = "Mimic";
+
+    /**
+     * Configuration parameters
+     */
+    private static final String PARAM_URL = "url";
+    private static final String PARAM_SPEAKINGRATE = "speakingRate";
+    private static final String PARAM_AUDIOVOLATILITY = "audioVolatility";
+    private static final String PARAM_PHONEMEVOLATILITY = "phonemeVolatility";
+
+    /**
+     * API paths, appended to the configured base URL
+     */
+    private static final String LIST_VOICES_URL = "/api/voices";
+    private static final String SYNTHESIZE_URL = "/api/tts";
+
+    /** Prefix identifying the input text as SSML */
+    private static final String SSML_PREFIX = "<speak>";
+
+    /** The only wave format supported */
+    private static final AudioFormat AUDIO_FORMAT = new AudioFormat(AudioFormat.CONTAINER_WAVE,
+            AudioFormat.CODEC_PCM_SIGNED, false, 16, 52000, 22050L, 1);
+
+    /**
+     * Immutable snapshot of the voices returned by the last refresh. The field is replaced as a whole (never
+     * modified in place), so readers on other threads see either the previous or the new complete set.
+     */
+    private volatile Set<Voice> availableVoices = Set.of();
+
+    /**
+     * Logger.
+     */
+    private final Logger logger = LoggerFactory.getLogger(MimicTTSService.class);
+
+    private final MimicConfiguration config = new MimicConfiguration();
+
+    private final Gson gson = new GsonBuilder().create();
+
+    /**
+     * Called by the framework when the component is activated.
+     *
+     * @param config Initial configuration
+     */
+    @Activate
+    protected void activate(Map<String, Object> config) {
+        updateConfig(config);
+    }
+
+    /**
+     * Called by the framework when the configuration was updated. Parameters that are missing or cannot be parsed
+     * fall back to the defaults declared in {@link MimicConfiguration}. The voice list is refreshed afterwards.
+     *
+     * @param newConfig Updated configuration
+     */
+    @Modified
+    private void updateConfig(Map<String, Object> newConfig) {
+        logger.debug("Updating configuration");
+
+        // A fresh instance carries the default of every parameter; used whenever a value is missing or invalid,
+        // so that the "Using default" log messages are true on reconfiguration as well as on activation.
+        MimicConfiguration defaults = new MimicConfiguration();
+
+        // URL of the Mimic web server
+        Object param = newConfig.get(PARAM_URL);
+        if (param == null) {
+            logger.warn("Missing URL to access Mimic TTS API. Using localhost");
+            config.url = defaults.url;
+        } else {
+            config.url = param.toString();
+        }
+
+        config.audioVolatility = readDouble(newConfig, PARAM_AUDIOVOLATILITY, defaults.audioVolatility);
+        config.phonemeVolatility = readDouble(newConfig, PARAM_PHONEMEVOLATILITY, defaults.phonemeVolatility);
+        config.speakingRate = readDouble(newConfig, PARAM_SPEAKINGRATE, defaults.speakingRate);
+
+        refreshVoices();
+    }
+
+    /**
+     * Reads a numeric parameter from a configuration map.
+     *
+     * @param source Configuration map to read from
+     * @param key Name of the parameter
+     * @param fallback Value returned when the parameter is absent or is not a valid number
+     * @return The parsed value, or {@code fallback}
+     */
+    private double readDouble(Map<String, Object> source, String key, double fallback) {
+        Object param = source.get(key);
+        if (param == null) {
+            return fallback;
+        }
+        try {
+            return Double.parseDouble(param.toString());
+        } catch (NumberFormatException e) {
+            logger.warn("Cannot parse {} parameter. Using default", key);
+            return fallback;
+        }
+    }
+
+    @Override
+    public String getId() {
+        return SERVICE_ID;
+    }
+
+    @Override
+    public String getLabel(@Nullable Locale locale) {
+        return SERVICE_NAME;
+    }
+
+    /**
+     * @return An unmodifiable snapshot of the voices found by the last call to {@link #refreshVoices()}
+     */
+    @Override
+    public Set<Voice> getAvailableVoices() {
+        return availableVoices;
+    }
+
+    /**
+     * Queries the Mimic server for its voices and replaces the set of available voices with the result. When the
+     * server cannot be reached or its answer cannot be used, the set of available voices becomes empty.
+     */
+    public void refreshVoices() {
+        // Single assignment of a complete, immutable set: no reader ever observes a partially filled collection.
+        availableVoices = Set.copyOf(fetchVoices());
+    }
+
+    /**
+     * Downloads and parses the voice list of the Mimic server. A voice declaring speakers yields one entry per
+     * speaker, any other voice yields a single entry without speaker.
+     *
+     * @return The voices found, empty if the request or the parsing failed
+     */
+    private Set<Voice> fetchVoices() {
+        String url = config.url + LIST_VOICES_URL;
+        Set<Voice> voices = new HashSet<>();
+        try {
+            String responseVoices = HttpRequestBuilder.getFrom(url).getContentAsString();
+            VoiceDto[] mimicVoiceResponse = gson.fromJson(responseVoices, VoiceDto[].class);
+            if (mimicVoiceResponse == null) {
+                logger.warn("Cannot get mimic voices from the URL {}", url);
+                return voices;
+            } else if (mimicVoiceResponse.length == 0) {
+                logger.debug("Voice set response from Mimic is empty ?!");
+                return voices;
+            }
+            for (VoiceDto voiceDto : mimicVoiceResponse) {
+                if (voiceDto.speakers != null && voiceDto.speakers.size() > 0) {
+                    for (String speaker : voiceDto.speakers) {
+                        voices.add(new MimicVoice(voiceDto.key, voiceDto.language, voiceDto.name, speaker));
+                    }
+                } else {
+                    voices.add(new MimicVoice(voiceDto.key, voiceDto.language, voiceDto.name, null));
+                }
+            }
+        } catch (IOException | JsonSyntaxException e) {
+            logger.warn("Cannot get mimic voices from the URL {}, error {}", url, e.getMessage());
+        }
+        return voices;
+    }
+
+    @Override
+    public Set<AudioFormat> getSupportedFormats() {
+        return Set.<AudioFormat> of(AUDIO_FORMAT);
+    }
+
+    /**
+     * Checks parameters and calls the API to synthesize voice.
+     * <p>
+     * The text is trimmed before it is sent. If the trimmed text starts with {@code <speak>}, the request asks
+     * the server to treat it as SSML.
+     *
+     * @param text Input text.
+     * @param voice Selected voice.
+     * @param requestedFormat Format that is supported by the target sink as well.
+     * @return Output audio stream
+     * @throws TTSException in case the service is unavailable or a parameter is invalid.
+     */
+    @Override
+    public AudioStream synthesize(String text, Voice voice, AudioFormat requestedFormat) throws TTSException {
+
+        if (!availableVoices.contains(voice)) {
+            // let a chance for the service to update :
+            refreshVoices();
+            if (!availableVoices.contains(voice)) {
+                throw new TTSException("Voice " + voice.getUID() + " not available for MimicTTS");
+            }
+        }
+
+        logger.debug("Synthesize '{}' for voice '{}' in format {}", text, voice.getUID(), requestedFormat);
+        // Validate arguments
+        // trim text
+        String trimmedText = text.trim();
+        if (trimmedText.isEmpty()) {
+            throw new TTSException("The passed text is empty");
+        }
+        if (!AUDIO_FORMAT.isCompatible(requestedFormat)) {
+            throw new TTSException("The passed AudioFormat is unsupported");
+        }
+        String encodedText;
+        try {
+            // Encode the trimmed text, so the text sent is the same one the SSML check below looks at
+            encodedText = URLEncoder.encode(trimmedText, StandardCharsets.UTF_8.toString());
+        } catch (UnsupportedEncodingException e) {
+            throw new IllegalArgumentException("Cannot encode text in URL " + text, e);
+        }
+
+        // Detect SSML on the trimmed text: leading whitespace must not hide the <speak> prefix
+        String ssml = trimmedText.startsWith(SSML_PREFIX) ? "&ssml=true" : "";
+
+        // create the audio byte array for given text, locale, format
+        // NOTE(review): the voice name is inserted verbatim; confirm MimicVoice#getTechnicalName is URL-safe
+        String urlTTS = config.url + SYNTHESIZE_URL + "?text=" + encodedText + "&voice="
+                + ((MimicVoice) voice).getTechnicalName() + ssml + "&noiseScale=" + config.audioVolatility + "&noiseW="
+                + config.phonemeVolatility + "&lengthScale=" + config.speakingRate + "&audioTarget=client";
+        logger.debug("Querying mimic with URL {}", urlTTS);
+        RawType responseWav = HttpUtil.downloadData(urlTTS, "audio/wav", false, -1);
+        if (responseWav == null) {
+            throw new TTSException("Cannot get wav from mimic url " + urlTTS);
+        }
+        return new ByteArrayAudioStream(responseWav.getBytes(), AUDIO_FORMAT);
+    }
+}