Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[3.x] Backport text-to-speech support. #61316

Merged
merged 1 commit into from
Aug 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/linux_builds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ jobs:
# The actual dependencies
sudo apt-get install build-essential pkg-config libx11-dev libxcursor-dev \
libxinerama-dev libgl1-mesa-dev libglu-dev libasound2-dev libpulse-dev \
libdbus-1-dev libudev-dev libxi-dev libxrandr-dev yasm xvfb wget unzip
libdbus-1-dev libudev-dev libxi-dev libxrandr-dev yasm xvfb wget unzip \
libspeechd-dev speech-dispatcher

- name: Setup Godot build cache
uses: ./.github/actions/godot-cache
Expand Down
53 changes: 53 additions & 0 deletions core/bind/core_bind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,42 @@ void _OS::global_menu_clear(const String &p_menu) {
OS::get_singleton()->global_menu_clear(p_menu);
}

bool _OS::tts_is_speaking() const {
return OS::get_singleton()->tts_is_speaking();
}

bool _OS::tts_is_paused() const {
return OS::get_singleton()->tts_is_paused();
}

// Script-facing wrapper: returns the voice list reported by the OS singleton.
Array _OS::tts_get_voices() const {
	const OS *os = OS::get_singleton();
	return os->tts_get_voices();
}

// Script-facing wrapper: returns the voice identifiers the OS singleton
// reports for `p_language`.
PoolStringArray _OS::tts_get_voices_for_language(const String &p_language) const {
	const OS *os = OS::get_singleton();
	return os->tts_get_voices_for_language(p_language);
}

void _OS::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
OS::get_singleton()->tts_speak(p_text, p_voice, p_volume, p_pitch, p_rate, p_utterance_id, p_interrupt);
}

void _OS::tts_pause() {
OS::get_singleton()->tts_pause();
}

void _OS::tts_resume() {
OS::get_singleton()->tts_resume();
}

void _OS::tts_stop() {
OS::get_singleton()->tts_stop();
}

// Script-facing wrapper: registers `p_callback` on `p_object` for the given
// utterance event by forwarding to the OS singleton.
void _OS::tts_set_utterance_callback(TTSUtteranceEvent p_event, Object *p_object, String p_callback) {
	// The bound enum is converted to the OS enum by value, so both enums must
	// keep their enumerators in the same order.
	const OS::TTSUtteranceEvent os_event = static_cast<OS::TTSUtteranceEvent>(p_event);
	OS::get_singleton()->tts_set_utterance_callback(os_event, p_object, p_callback);
}

Point2 _OS::get_mouse_position() const {
return OS::get_singleton()->get_mouse_position();
}
Expand Down Expand Up @@ -1260,6 +1296,18 @@ void _OS::_bind_methods() {
//ClassDB::bind_method(D_METHOD("is_video_mode_resizable","screen"),&_OS::is_video_mode_resizable,DEFVAL(0));
//ClassDB::bind_method(D_METHOD("get_fullscreen_mode_list","screen"),&_OS::get_fullscreen_mode_list,DEFVAL(0));

ClassDB::bind_method(D_METHOD("tts_is_speaking"), &_OS::tts_is_speaking);
ClassDB::bind_method(D_METHOD("tts_is_paused"), &_OS::tts_is_paused);
ClassDB::bind_method(D_METHOD("tts_get_voices"), &_OS::tts_get_voices);
ClassDB::bind_method(D_METHOD("tts_get_voices_for_language", "language"), &_OS::tts_get_voices_for_language);

ClassDB::bind_method(D_METHOD("tts_speak", "text", "voice", "volume", "pitch", "rate", "utterance_id", "interrupt"), &_OS::tts_speak, DEFVAL(50), DEFVAL(1.f), DEFVAL(1.f), DEFVAL(0), DEFVAL(false));
ClassDB::bind_method(D_METHOD("tts_pause"), &_OS::tts_pause);
ClassDB::bind_method(D_METHOD("tts_resume"), &_OS::tts_resume);
ClassDB::bind_method(D_METHOD("tts_stop"), &_OS::tts_stop);

ClassDB::bind_method(D_METHOD("tts_set_utterance_callback", "event", "object", "callback"), &_OS::tts_set_utterance_callback);

ClassDB::bind_method(D_METHOD("global_menu_add_item", "menu", "label", "id", "meta"), &_OS::global_menu_add_item);
ClassDB::bind_method(D_METHOD("global_menu_add_separator", "menu"), &_OS::global_menu_add_separator);
ClassDB::bind_method(D_METHOD("global_menu_remove_item", "menu", "idx"), &_OS::global_menu_remove_item);
Expand Down Expand Up @@ -1568,6 +1616,11 @@ void _OS::_bind_methods() {
BIND_ENUM_CONSTANT(POWERSTATE_NO_BATTERY);
BIND_ENUM_CONSTANT(POWERSTATE_CHARGING);
BIND_ENUM_CONSTANT(POWERSTATE_CHARGED);

BIND_ENUM_CONSTANT(TTS_UTTERANCE_STARTED);
BIND_ENUM_CONSTANT(TTS_UTTERANCE_ENDED);
BIND_ENUM_CONSTANT(TTS_UTTERANCE_CANCELED);
BIND_ENUM_CONSTANT(TTS_UTTERANCE_BOUNDARY);
}

_OS::_OS() {
Expand Down
21 changes: 21 additions & 0 deletions core/bind/core_bind.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,31 @@ class _OS : public Object {
OPENGL_CONTEXT, // HGLRC, X11::GLXContext, NSOpenGLContext*, EGLContext* ...
};

// Text-to-speech utterance events exposed to scripts. Values are cast directly
// to OS::TTSUtteranceEvent, so the numbering must stay in sync with that enum.
enum TTSUtteranceEvent {
	TTS_UTTERANCE_STARTED = 0, // Utterance has begun to be spoken.
	TTS_UTTERANCE_ENDED = 1, // Utterance was successfully finished.
	TTS_UTTERANCE_CANCELED = 2, // Utterance was canceled, or the TTS service was unable to process it.
	TTS_UTTERANCE_BOUNDARY = 3, // Utterance reached a word or sentence boundary.
	TTS_UTTERANCE_MAX = 4, // Count of event kinds above.
};

void global_menu_add_item(const String &p_menu, const String &p_label, const Variant &p_signal, const Variant &p_meta);
void global_menu_add_separator(const String &p_menu);
void global_menu_remove_item(const String &p_menu, int p_idx);
void global_menu_clear(const String &p_menu);

bool tts_is_speaking() const;
bool tts_is_paused() const;
Array tts_get_voices() const;
PoolStringArray tts_get_voices_for_language(const String &p_language) const;

void tts_speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int p_utterance_id = 0, bool p_interrupt = false);
void tts_pause();
void tts_resume();
void tts_stop();

void tts_set_utterance_callback(TTSUtteranceEvent p_event, Object *p_object, String p_callback);

Point2 get_mouse_position() const;
void set_window_title(const String &p_title);
void set_window_mouse_passthrough(const PoolVector2Array &p_region);
Expand Down Expand Up @@ -413,6 +433,7 @@ VARIANT_ENUM_CAST(_OS::Month);
VARIANT_ENUM_CAST(_OS::SystemDir);
VARIANT_ENUM_CAST(_OS::ScreenOrientation);
VARIANT_ENUM_CAST(_OS::HandleType);
VARIANT_ENUM_CAST(_OS::TTSUtteranceEvent);

class _Geometry : public Object {
GDCLASS(_Geometry, Object);
Expand Down
69 changes: 69 additions & 0 deletions core/os/os.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,75 @@ bool OS::can_use_threads() const {
#endif
}

// Default implementation for platforms without a text-to-speech backend:
// prints a warning and reports that nothing is being spoken.
// Declared virtual in os.h so a subclass can replace it.
bool OS::tts_is_speaking() const {
WARN_PRINT("TTS is not supported by this platform.");
return false;
}

// Default implementation for platforms without a text-to-speech backend:
// prints a warning and reports that the synthesizer is not paused.
// Declared virtual in os.h so a subclass can replace it.
bool OS::tts_is_paused() const {
WARN_PRINT("TTS is not supported by this platform.");
return false;
}

// Default implementation for platforms without a text-to-speech backend:
// does nothing except print a warning.
// Declared virtual in os.h so a subclass can replace it.
void OS::tts_pause() {
	// Message spelling fixed ("platformr." -> "platform.") so it matches the
	// warning printed by the other unsupported-TTS stubs.
	WARN_PRINT("TTS is not supported by this platform.");
}

// Default implementation for platforms without a text-to-speech backend:
// does nothing except print a warning.
// Declared virtual in os.h so a subclass can replace it.
void OS::tts_resume() {
WARN_PRINT("TTS is not supported by this platform.");
}

// Default implementation for platforms without a text-to-speech backend:
// prints a warning and returns an empty Array (no voices).
// Declared virtual in os.h so a subclass can replace it.
Array OS::tts_get_voices() const {
WARN_PRINT("TTS is not supported by this platform.");
return Array();
}

// Collects the "id" of every voice returned by tts_get_voices() whose
// "language" entry starts with `p_language`. Matching is a prefix test, so
// "en" also selects voices tagged e.g. "en_US". Voices missing either the
// "id" or the "language" key are ignored.
PoolStringArray OS::tts_get_voices_for_language(const String &p_language) const {
	PoolStringArray matching_ids;
	Array all_voices = tts_get_voices();
	for (int idx = 0; idx < all_voices.size(); idx++) {
		const Dictionary info = all_voices[idx];
		if (!info.has("id") || !info.has("language")) {
			continue;
		}
		const String voice_language = info["language"];
		if (voice_language.begins_with(p_language)) {
			matching_ids.push_back(info["id"]);
		}
	}
	return matching_ids;
}

// Default implementation for platforms without a text-to-speech backend:
// ignores every argument and only prints a warning.
// Declared virtual in os.h so a subclass can replace it.
void OS::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
WARN_PRINT("TTS is not supported by this platform.");
}

// Default implementation for platforms without a text-to-speech backend:
// does nothing except print a warning.
// Declared virtual in os.h so a subclass can replace it.
void OS::tts_stop() {
WARN_PRINT("TTS is not supported by this platform.");
}

// Stores the receiver object and method name to notify for `p_event`,
// replacing whatever was registered for that event before. Out-of-range event
// values are rejected by ERR_FAIL_INDEX and nothing is stored.
void OS::tts_set_utterance_callback(TTSUtteranceEvent p_event, Object *p_object, const StringName &p_callback) {
	ERR_FAIL_INDEX(p_event, OS::TTS_UTTERANCE_MAX);
	Callback &slot = utterance_callback[p_event];
	slot.object = p_object;
	slot.cb_name = p_callback;
}

// Notifies the callback registered for `p_event`, if any, through a deferred
// call on the stored object.
// - STARTED / ENDED / CANCELED callbacks receive (p_id).
// - BOUNDARY callbacks receive (p_pos, p_id).
// Out-of-range event values are rejected by ERR_FAIL_INDEX.
// NOTE(review): the receiver is kept as a raw Object pointer; nothing visible
// here clears it when the object is freed — confirm callers guarantee lifetime.
void OS::tts_post_utterance_event(TTSUtteranceEvent p_event, int p_id, int p_pos) {
	ERR_FAIL_INDEX(p_event, OS::TTS_UTTERANCE_MAX);

	const Callback &registered = utterance_callback[p_event];
	if (registered.object == nullptr) {
		// No receiver registered for this event.
		return;
	}

	if (p_event == OS::TTS_UTTERANCE_BOUNDARY) {
		registered.object->call_deferred(registered.cb_name, p_pos, p_id);
	} else {
		// After the index check only STARTED, ENDED and CANCELED remain.
		registered.object->call_deferred(registered.cb_name, p_id);
	}
}

OS::MouseMode OS::get_mouse_mode() const {
return MOUSE_MODE_VISIBLE;
}
Expand Down
39 changes: 39 additions & 0 deletions core/os/os.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,31 @@ class OS {
}
};

// One text-to-speech request. The fields mirror the tts_speak() parameters by
// name and default value; presumably used by platform backends to queue
// utterances (not visible in this file — confirm).
struct TTSUtterance {
// Text to be spoken.
String text;
// Voice identifier.
String voice;
// Volume; the OS.tts_speak docs give the range as 0 (lowest) to 100 (highest).
int volume = 50;
// Pitch; the OS.tts_speak docs give the range as 0.0 to 2.0, with 1.0 the voice's default.
float pitch = 1.f;
// Rate; the OS.tts_speak docs give the range as 0.1 to 10.0, with 1.0 the normal speaking rate.
float rate = 1.f;
// Utterance id.
int id = 0;
};

// Kinds of text-to-speech utterance notifications. The value is used as an
// index into the per-event callback table, so the numbering must stay dense
// and start at zero.
enum TTSUtteranceEvent {
	TTS_UTTERANCE_STARTED = 0, // Utterance has begun to be spoken.
	TTS_UTTERANCE_ENDED = 1, // Utterance was successfully finished.
	TTS_UTTERANCE_CANCELED = 2, // Utterance was canceled, or the TTS service was unable to process it.
	TTS_UTTERANCE_BOUNDARY = 3, // Utterance reached a word or sentence boundary.
	TTS_UTTERANCE_MAX = 4, // Count of event kinds; sizes the callback table and bounds index checks.
};

private:
// A registered utterance callback: which object to notify and which of its
// methods to invoke.
struct Callback {
// Receiver of the deferred call; nullptr means no callback is registered.
Object *object = nullptr;
// Name of the method invoked on `object`.
StringName cb_name;
};

Callback utterance_callback[TTS_UTTERANCE_MAX];

protected:
friend class Main;

Expand Down Expand Up @@ -172,6 +197,20 @@ class OS {
virtual void set_mouse_mode(MouseMode p_mode);
virtual MouseMode get_mouse_mode() const;

virtual bool tts_is_speaking() const;
virtual bool tts_is_paused() const;
virtual Array tts_get_voices() const;

virtual PoolStringArray tts_get_voices_for_language(const String &p_language) const;

virtual void tts_speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int p_utterance_id = 0, bool p_interrupt = false);
virtual void tts_pause();
virtual void tts_resume();
virtual void tts_stop();

virtual void tts_set_utterance_callback(TTSUtteranceEvent p_event, Object *p_object, const StringName &p_callback);
virtual void tts_post_utterance_event(TTSUtteranceEvent p_event, int p_id, int p_pos = 0);

virtual void warp_mouse_position(const Point2 &p_to) {}
virtual Point2 get_mouse_position() const = 0;
virtual int get_mouse_button_state() const = 0;
Expand Down
100 changes: 100 additions & 0 deletions doc/classes/OS.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1020,6 +1020,94 @@
[b]Note:[/b] This method is implemented on Android, iOS and UWP.
</description>
</method>
<method name="tts_get_voices" qualifiers="const">
<return type="Array" />
<description>
Returns an [Array] of voice information dictionaries.
Each [Dictionary] contains three [String] entries:
- [code]name[/code] is the voice name.
- [code]id[/code] is the voice identifier.
- [code]language[/code] is the language code in [code]lang_Variant[/code] format. The [code]lang[/code] part is a 2 or 3-letter code based on the ISO-639 standard, in lowercase. The [code]Variant[/code] part is an engine-dependent string describing country, region and/or dialect.
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
<method name="tts_get_voices_for_language" qualifiers="const">
<return type="PoolStringArray" />
<argument index="0" name="language" type="String" />
<description>
Returns a [PoolStringArray] of voice identifiers for the [code]language[/code].
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
<method name="tts_is_paused" qualifiers="const">
<return type="bool" />
<description>
Returns [code]true[/code] if the synthesizer is in a paused state.
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
<method name="tts_is_speaking" qualifiers="const">
<return type="bool" />
<description>
Returns [code]true[/code] if the synthesizer is generating speech, or has utterances waiting in the queue.
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
<method name="tts_pause">
<return type="void" />
<description>
Puts the synthesizer into a paused state.
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
<method name="tts_resume">
<return type="void" />
<description>
Resumes the synthesizer if it was paused.
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
<method name="tts_set_utterance_callback">
<return type="void" />
<argument index="0" name="event" type="int" enum="OS.TTSUtteranceEvent" />
<argument index="1" name="object" type="Object" />
<argument index="2" name="callback" type="String" />
<description>
Adds a callback, which is called when the utterance has started, finished, been canceled, or reached a text boundary.
- [code]TTS_UTTERANCE_STARTED[/code], [code]TTS_UTTERANCE_ENDED[/code], and [code]TTS_UTTERANCE_CANCELED[/code] callable's method should take one [int] parameter, the utterance id.
- [code]TTS_UTTERANCE_BOUNDARY[/code] callable's method should take two [int] parameters, the index of the character and the utterance id.
[b]Note:[/b] The granularity of the boundary callbacks is engine dependent.
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
<method name="tts_speak">
<return type="void" />
<argument index="0" name="text" type="String" />
<argument index="1" name="voice" type="String" />
<argument index="2" name="volume" type="int" default="50" />
<argument index="3" name="pitch" type="float" default="1.0" />
<argument index="4" name="rate" type="float" default="1.0" />
<argument index="5" name="utterance_id" type="int" default="0" />
<argument index="6" name="interrupt" type="bool" default="false" />
<description>
Adds an utterance to the queue. If [code]interrupt[/code] is [code]true[/code], the queue is cleared first.
- [code]voice[/code] identifier is one of the [code]"id"[/code] values returned by [method tts_get_voices] or one of the values returned by [method tts_get_voices_for_language].
- [code]volume[/code] ranges from [code]0[/code] (lowest) to [code]100[/code] (highest).
- [code]pitch[/code] ranges from [code]0.0[/code] (lowest) to [code]2.0[/code] (highest), [code]1.0[/code] is default pitch for the current voice.
- [code]rate[/code] ranges from [code]0.1[/code] (lowest) to [code]10.0[/code] (highest), [code]1.0[/code] is a normal speaking rate. Other values act as a percentage relative to the normal rate.
- [code]utterance_id[/code] is passed as a parameter to the callback functions.
[b]Note:[/b] On Windows and Linux, utterance [code]text[/code] can use SSML markup. SSML support is engine and voice dependent. If the engine does not support SSML, you should strip out all XML markup before calling [method tts_speak].
[b]Note:[/b] The granularity of pitch, rate, and volume is engine and voice dependent. Values may be truncated.
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
<method name="tts_stop">
<return type="void" />
<description>
Stops synthesis in progress and removes all utterances from the queue.
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
</description>
</method>
</methods>
<members>
<member name="clipboard" type="String" setter="set_clipboard" getter="get_clipboard" default="&quot;&quot;">
Expand Down Expand Up @@ -1246,5 +1334,17 @@
<constant name="POWERSTATE_CHARGED" value="4" enum="PowerState">
Plugged in, battery fully charged.
</constant>
<constant name="TTS_UTTERANCE_STARTED" value="0" enum="TTSUtteranceEvent">
Utterance has begun to be spoken.
</constant>
<constant name="TTS_UTTERANCE_ENDED" value="1" enum="TTSUtteranceEvent">
Utterance was successfully finished.
</constant>
<constant name="TTS_UTTERANCE_CANCELED" value="2" enum="TTSUtteranceEvent">
Utterance was canceled, or TTS service was unable to process it.
</constant>
<constant name="TTS_UTTERANCE_BOUNDARY" value="3" enum="TTSUtteranceEvent">
Utterance reached a word or sentence boundary.
</constant>
</constants>
</class>
1 change: 1 addition & 0 deletions platform/android/SCsub
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ android_files = [
"file_access_filesystem_jandroid.cpp",
"audio_driver_opensl.cpp",
"dir_access_jandroid.cpp",
"tts_android.cpp",
"thread_jandroid.cpp",
"net_socket_android.cpp",
"java_godot_lib_jni.cpp",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import org.godotengine.godot.io.file.FileAccessHandler;
import org.godotengine.godot.plugin.GodotPlugin;
import org.godotengine.godot.plugin.GodotPluginRegistry;
import org.godotengine.godot.tts.GodotTTS;
import org.godotengine.godot.utils.GodotNetUtils;
import org.godotengine.godot.utils.PermissionsUtil;
import org.godotengine.godot.xr.XRMode;
Expand Down Expand Up @@ -254,6 +255,7 @@ protected void instanceSingleton(SingletonBase s) {

public GodotIO io;
public GodotNetUtils netUtils;
public GodotTTS tts;

static SingletonBase[] singletons = new SingletonBase[MAX_SINGLETONS];
static int singleton_count = 0;
Expand Down Expand Up @@ -575,6 +577,7 @@ private void initializeGodot() {
final Activity activity = getActivity();
io = new GodotIO(activity);
netUtils = new GodotNetUtils(activity);
tts = new GodotTTS(activity);
Context context = getContext();
DirectoryAccessHandler directoryAccessHandler = new DirectoryAccessHandler(context);
FileAccessHandler fileAccessHandler = new FileAccessHandler(context);
Expand Down
Loading