From c1b4414fb9f7b2db819f0bea5cbce97be50af2de Mon Sep 17 00:00:00 2001
From: Roberts Slisans
Date: Thu, 14 Nov 2024 12:55:55 +0200
Subject: [PATCH] feat: experimental deepspeed on windows (#419)

* add experimental deepspeed wheel for win
* add japanese and italian for bark voice
* fix stable audio layout
* create React UI proxy base
* update README
---
 README.md                                  |  4 ++
 react-ui/src/pages/api/gradio/[name].tsx   | 80 ++++++++++------------
 requirements.txt                           |  1 +
 tts_webui/bark/clone/tab_voice_clone.py    |  2 +
 tts_webui/stable_audio/stable_audio_tab.py |  2 +-
 5 files changed, 45 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index d248f50..99e548d 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,10 @@
 
 ## Changelog
 
+Nov 14:
+* Add experimental Windows deepspeed wheel.
+* Add more languages to Bark voice clone.
+
 Nov 11:
 * Switch to a fixed fairseq version for windows reducing installation conflicts and speeding up updates.
 
diff --git a/react-ui/src/pages/api/gradio/[name].tsx b/react-ui/src/pages/api/gradio/[name].tsx
index e3d081a..537c293 100644
--- a/react-ui/src/pages/api/gradio/[name].tsx
+++ b/react-ui/src/pages/api/gradio/[name].tsx
@@ -2,7 +2,12 @@ import { Client } from "@gradio/client";
 import type { NextApiRequest, NextApiResponse } from "next";
 import { getFile } from "../../../backend-utils/getFile";
 import { GradioFile } from "../../../types/GradioFile";
-import { PayloadMessage, PredictFunction } from "@gradio/client/dist/types";
+import {
+  GradioEvent,
+  PayloadMessage,
+  PredictFunction,
+  SubmitIterable,
+} from "@gradio/client/dist/types";
 
 type Data = { data: any };
 
@@ -53,11 +58,40 @@ const extractChoicesTuple = ({ choices }: GradioChoices) =>
 const getChoices = (result: { data: GradioChoices[] }) =>
   extractChoices(result?.data[0]);
 
+const proxyGradioFile = (data: any) =>
+  // typeof data === "object" && data.__type__ === "file"
+  //   // ? new GradioFile(data.url, data.name)
+  //   : data;
+  data;
+
+const proxyGradioFiles = (data: any[]) =>
+  Array.isArray(data)
+    ? data.map(proxyGradioFile)
+    : // : typeof data === "object"
+      // ? Object.fromEntries(
+      //     Object.entries(data).map(([key, value]) => [
+      //       key,
+      //       proxyGradioFiles(value),
+      //     ])
+      //   )
+      data;
+
 const gradioPredict = <T,>(...args: Parameters<PredictFunction>) =>
-  getClient().then((app) => app.predict(...args)) as Promise<{ data: T }>;
+  // getClient().then((app) => app.predict(...args)) as Promise<{ data: T }>;
+  getClient()
+    .then((app) => app.predict(...args) as Promise<{ data: T }>)
+    .then((result: { data: T }) => ({
+      ...result,
+      data: proxyGradioFiles(result?.data) as T,
+    }));
 
 const gradioSubmit = <T,>(...args: Parameters<PredictFunction>) =>
-  getClient().then((app) => app.submit(...args));
+  getClient().then(
+    (app) =>
+      app.submit(...args) as SubmitIterable<
+        ({ data: T } & PayloadMessage) | GradioEvent
+      >
+  );
 
 async function musicgen({ melody, model, ...params }) {
   const melodyBlob = await getFile(melody);
@@ -155,11 +189,6 @@ async function bark({
   };
 }
 
-const reload_old_generation_dropdown = () =>
-  gradioPredict<[GradioChoices]>("/reload_old_generation_dropdown").then(
-    getChoices
-  );
-
 const bark_favorite = async ({ folder_root }) =>
   gradioPredict<[Object]>("/bark_favorite", [folder_root]).then(
     (result) => result?.data
@@ -237,15 +266,6 @@ async function tortoise({
   return results.slice(0, -1);
 }
 
-const tortoise_refresh_models = () =>
-  gradioPredict<[GradioChoices]>("/tortoise_refresh_models").then(getChoices);
-
-const tortoise_refresh_voices = () =>
-  gradioPredict<[GradioChoices]>("/tortoise_refresh_voices").then(getChoices);
-
-const tortoise_open_models = () => gradioPredict<[]>("/tortoise_open_models");
-const tortoise_open_voices = () => gradioPredict<[]>("/tortoise_open_voices");
-
 async function tortoise_apply_model_settings({
   model, // string (Option from: ['Default']) in 'parameter_2488' Dropdown component
   kv_cache, // boolean in 'parameter_2493' Checkbox component
@@ -308,32 +328,6 @@ async function rvc({
 const delete_generation = ({ folder_root }) =>
   gradioPredict<[]>("/delete_generation", [folder_root]);
 
-const save_to_voices = ({ history_npz }) =>
-  gradioPredict<[Object]>("/save_to_voices", [history_npz]);
-
-const save_config_bark = ({
-  text_use_gpu,
-  text_use_small,
-  coarse_use_gpu,
-  coarse_use_small,
-  fine_use_gpu,
-  fine_use_small,
-  codec_use_gpu,
-  load_models_on_startup,
-}) =>
-  gradioPredict<[string]>("/save_config_bark", [
-    text_use_gpu, // boolean in 'Use GPU' Checkbox component
-    text_use_small, // boolean in 'Use small model' Checkbox component
-    coarse_use_gpu, // boolean in 'Use GPU' Checkbox component
-    coarse_use_small, // boolean in 'Use small model' Checkbox component
-    fine_use_gpu, // boolean in 'Use GPU' Checkbox component
-    fine_use_small, // boolean in 'Use small model' Checkbox component
-    codec_use_gpu, // boolean in 'Use GPU for codec' Checkbox component
-    load_models_on_startup, // boolean in 'Load Bark models on startup' Checkbox component
-  ]).then((result) => result?.data[0]);
-
-// get_config_bark
-
 async function get_config_bark() {
   const result = await gradioPredict<
     [
diff --git a/requirements.txt b/requirements.txt
index 1a20536..85ff3f5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ beartype>=0.16.1 # workaround for a bug
 # no longer required directly # transformers==4.36.1 # cross-compatibility
 iso639-lang==2.2.3
 pillow==10.3.0 # for gradio, conda fix
+deepspeed @ https://github.com/rsxdalv/DeepSpeed/releases/download/v0.15.5-test/deepspeed-0.15.5+unknown-cp310-cp310-win_amd64.whl ; sys_platform == 'win32' # Apache 2.0
diff --git a/tts_webui/bark/clone/tab_voice_clone.py b/tts_webui/bark/clone/tab_voice_clone.py
index 0e3de01..c455ed1 100644
--- a/tts_webui/bark/clone/tab_voice_clone.py
+++ b/tts_webui/bark/clone/tab_voice_clone.py
@@ -184,6 +184,8 @@ def tab_voice_clone():
                 "es_tokenizer.pth @ Lancer1408/bark-es-tokenizer",
                 "portuguese-HuBERT-quantizer_24_epoch.pth @ MadVoyager/bark-voice-cloning-portuguese-HuBERT-quantizer",
                 "turkish_model_epoch_14.pth @ egeadam/bark-voice-cloning-turkish-HuBERT-quantizer",
+                "japanese-HuBERT-quantizer_24_epoch.pth @ junwchina/bark-voice-cloning-japanese-HuBERT-quantizer",
+                "it_tokenizer.pth @ gpwr/bark-it-tokenizer",
             ],
             value="quantifier_hubert_base_ls960_14.pth @ GitMylo/bark-voice-cloning",
             allow_custom_value=True,
diff --git a/tts_webui/stable_audio/stable_audio_tab.py b/tts_webui/stable_audio/stable_audio_tab.py
index 1c707fb..87cffe8 100644
--- a/tts_webui/stable_audio/stable_audio_tab.py
+++ b/tts_webui/stable_audio/stable_audio_tab.py
@@ -222,7 +222,7 @@ def model_select_ui():
             outputs=[model_select],
             api_name="stable_audio_refresh_models",
         )
-        load_model_button = gr.Button(value="Load model")
+    load_model_button = gr.Button(value="Load model")
 
     with gr.Column():
         gr.Markdown(
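
Note on the React UI proxy base: the new `proxyGradioFile`/`proxyGradioFiles` helpers currently pass results through unchanged, with the file-conversion branch left commented out. The sketch below shows what that hook appears intended to do once it is enabled, based only on the commented-out check (`__type__ === "file"` with `url`/`name` fields); the local `GradioFile` type is a stand-in for the project's `types/GradioFile`, not its actual definition.

```ts
// Stand-in for the project's types/GradioFile (assumed shape, not the real definition).
type GradioFile = { url: string; name: string };

// Detect a Gradio file payload, as hinted at by the commented-out check in the patch.
const isGradioFilePayload = (
  value: unknown
): value is { __type__: "file"; url: string; name: string } =>
  typeof value === "object" &&
  value !== null &&
  (value as any).__type__ === "file";

// Convert a single value: file payloads become GradioFile, everything else passes through.
const proxyGradioFile = (value: unknown): GradioFile | unknown =>
  isGradioFilePayload(value) ? { url: value.url, name: value.name } : value;

// Convert a whole predict() result array.
const proxyGradioFiles = (data: unknown): unknown =>
  Array.isArray(data) ? data.map(proxyGradioFile) : data;

// Example: an endpoint that returns one audio file and one string output.
console.log(
  proxyGradioFiles([
    { __type__: "file", url: "http://127.0.0.1:7770/file=out.wav", name: "out.wav" },
    "generation metadata",
  ])
);
```

Centralizing the conversion in one mapping step means `gradioPredict` callers would receive UI-shaped file objects without each endpoint wrapper doing its own translation, which is presumably why the pass-through hook is wired in now even though the conversion itself is still disabled.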
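Note on the `gradioSubmit` typing: the cast to `SubmitIterable<({ data: T } & PayloadMessage) | GradioEvent>` reflects that `@gradio/client`'s `submit()` returns an async-iterable stream of status and data events rather than a one-shot promise. A hedged usage sketch of consuming that stream follows; the backend URL and the `"/bark"` endpoint with a single string input are placeholders for illustration, not values taken from this patch.

```ts
import { Client } from "@gradio/client";

// Placeholder URL; in the app this comes from the getClient() helper.
const GRADIO_URL = "http://127.0.0.1:7770/";

async function main() {
  const app = await Client.connect(GRADIO_URL);

  // submit() returns an async iterable of status and data events.
  const job = app.submit("/bark", ["Hello world"]);

  for await (const msg of job) {
    if (msg.type === "status") {
      console.log("status:", msg.stage); // e.g. "pending", "generating", "complete"
    } else if (msg.type === "data") {
      console.log("outputs:", msg.data); // same shape gradioPredict resolves with
    }
  }
}

main().catch(console.error);
```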