Add sherpa-onnx support #50

Merged · 3 commits · Dec 13, 2024
README.md: 13 additions, 0 deletions
@@ -128,6 +128,7 @@ Currently supported Speech recognition backend
- [Whisper](https://github.com/openai/whisper) (local)
- [Groq Whisper](https://groq.com/) (API Key required). This is a hosted Whisper endpoint, which is fast and has a generous free limit every day.
- [Azure Speech Recognition](https://azure.microsoft.com/en-us/products/ai-services/speech-to-text) (API Key required)
- [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) (Local, fast; supports many model types, including transducer, Paraformer, NeMo CTC, WeNet CTC, Whisper, TDNN CTC, and SenseVoice.)
- The microphone of the machine running the server is used by default. You can change the `MIC_IN_BROWSER` setting in `conf.yaml` to move the microphone (and voice activity detection) to the browser, at the cost of some latency for now. Using the client-side (browser) microphone instead of the server's one is useful when the backend runs on a different machine, in a VM, or in Docker.

Currently supported Text to Speech backend
@@ -139,6 +140,7 @@ Currently supported Text to Speech backend
- [xTTSv2](https://github.com/daswer123/xtts-api-server) (Local, very resource-consuming)
- [Edge TTS](https://github.com/rany2/edge-tts) (online, no API key required)
- [Azure Text-to-Speech](https://azure.microsoft.com/en-us/products/ai-services/text-to-speech) (online, API Key required)
- [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) (Local, fast, supports various models. For English, piper models are recommended. For pure Chinese, consider using [sherpa-onnx-vits-zh-ll.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2). For a mix of Chinese and English, [vits-melo-tts-zh_en.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-melo-tts-zh_en.tar.bz2) can be used, though the English pronunciation might not be ideal.)

Fast Text Synthesis
- Synthesize sentences as soon as they arrive, so there is no need to wait for the entire LLM response.
@@ -205,6 +207,12 @@ Edit the ASR_MODEL settings in the `conf.yaml` to change the provider.

Here are the options you have for speech recognition:

`sherpa-onnx` (local, runs very fast)
- Install with `pip install sherpa-onnx` (~20 MB).
- Download your desired model from [sherpa-onnx ASR models](https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models).
- Refer to `config_alts` in the repository for configuration examples and adjust the model path in your `conf.yaml` accordingly.
- Offers great performance and is significantly lighter than FunASR; a minimal usage sketch follows this list.
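For reference, here is a minimal sketch of offline recognition with the sherpa-onnx Python API, mirroring how the `SherpaOnnxASR` backend in this PR decodes audio. The SenseVoice model paths are placeholders, and the use of `soundfile` to load a 16 kHz mono WAV is an assumption; adapt both to your setup.

```python
import sherpa_onnx
import soundfile as sf  # assumed helper for reading the test WAV

# Placeholder paths for a downloaded SenseVoice model.
recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
    model="/path/to/sense-voice/model.onnx",
    tokens="/path/to/sense-voice/tokens.txt",
    num_threads=4,
    use_itn=True,
)

# Assumes a 16 kHz mono recording; resample or mix down first if needed.
samples, sample_rate = sf.read("test.wav", dtype="float32")

stream = recognizer.create_stream()
stream.accept_waveform(sample_rate, samples)
recognizer.decode_streams([stream])
print(stream.result.text)
```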


`FunASR` (~~local~~) (Runs very fast even on CPU. Not sure how they did it)
- [FunASR](https://github.com/modelscope/FunASR?tab=readme-ov-file) is a Fundamental End-to-End Speech Recognition Toolkit from ModelScope that runs many ASR models. The result and speed are pretty good with the SenseVoiceSmall from [FunAudioLLM](https://github.com/FunAudioLLM/SenseVoice) at Alibaba Group.
@@ -250,6 +258,11 @@ WhisperCPP coreML configuration:
## Install Speech Synthesis (text to speech) (TTS)
Install the respective package and turn it on using the `TTS_MODEL` option in `conf.yaml`.

`sherpa-onnx` (local)
- Install with `pip install sherpa-onnx`.
- Download your desired model from [sherpa-onnx TTS models](https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models).
- Refer to `config_alts` in the repository for configuration examples and adjust the model path in your `conf.yaml` accordingly; a minimal synthesis sketch follows this list.
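For reference, here is a minimal sketch of offline synthesis with the sherpa-onnx Python TTS API, laid out like the `SherpaOnnxTTS` block added to `conf.yaml` in this PR. The VITS model paths, the `soundfile` dependency, and the exact config fields are assumptions; adjust them to the model and sherpa-onnx version you installed.

```python
import sherpa_onnx
import soundfile as sf  # assumed helper for writing the generated WAV

# Placeholder paths for a downloaded VITS model (e.g., vits-melo-tts-zh_en).
config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        vits=sherpa_onnx.OfflineTtsVitsModelConfig(
            model="/path/to/vits-melo-tts-zh_en/model.onnx",
            lexicon="/path/to/vits-melo-tts-zh_en/lexicon.txt",
            tokens="/path/to/vits-melo-tts-zh_en/tokens.txt",
            dict_dir="/path/to/vits-melo-tts-zh_en/dict",
        ),
        provider="cpu",
        num_threads=1,
    ),
    max_num_sentences=2,
)

tts = sherpa_onnx.OfflineTts(config)
audio = tts.generate("Hello from sherpa-onnx!", sid=1, speed=1.0)
sf.write("output.wav", audio.samples, samplerate=audio.sample_rate)
```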

`pyttsx3TTS` (local, fast)
- Install with the command `pip install py3-tts`.
- This package will use the default TTS engine on your system. It uses `sapi5` on Windows, `nsss` on Mac, and `espeak` on other platforms.
asr/asr_factory.py: 3 additions, 0 deletions
@@ -51,5 +51,8 @@ def get_asr_system(system_name: str, **kwargs) -> Type[ASRInterface]:
model=kwargs.get("model"),
lang=kwargs.get("lang"),
)
elif system_name == "SherpaOnnxASR":
from .sherpa_onnx_asr import VoiceRecognition as SherpaOnnxASR
return SherpaOnnxASR(**kwargs)
else:
raise ValueError(f"Unknown ASR system: {system_name}")
asr/sherpa_onnx_asr.py: 155 additions, 0 deletions
@@ -0,0 +1,155 @@
import numpy as np
import sherpa_onnx
from .asr_interface import ASRInterface

class VoiceRecognition(ASRInterface):

def __init__(
self,
model_type: str = "paraformer", # or "transducer", "nemo_ctc", "wenet_ctc", "whisper", "tdnn_ctc", "sense_voice"
encoder: str = None, # Path to the encoder model, used with transducer
decoder: str = None, # Path to the decoder model, used with transducer
joiner: str = None, # Path to the joiner model, used with transducer
paraformer: str = None, # Path to the model.onnx from Paraformer
nemo_ctc: str = None, # Path to the model.onnx from NeMo CTC
wenet_ctc: str = None, # Path to the model.onnx from WeNet CTC
tdnn_model: str = None, # Path to the model.onnx for the tdnn model of the yesno recipe
whisper_encoder: str = None, # Path to whisper encoder model
whisper_decoder: str = None, # Path to whisper decoder model
sense_voice: str = None, # Path to the model.onnx from SenseVoice
tokens: str = None, # Path to tokens.txt
hotwords_file: str = "", # Path to hotwords file
hotwords_score: float = 1.5, # Hotwords score
modeling_unit: str = "", # Modeling unit for hotwords
bpe_vocab: str = "", # Path to bpe vocabulary, used with hotwords
num_threads: int = 1, # Number of threads for neural network computation
whisper_language: str = "", # Language for whisper model
whisper_task: str = "transcribe", # Task for whisper model (transcribe or translate)
whisper_tail_paddings: int = -1, # Tail padding frames for whisper model
blank_penalty: float = 0.0, # Penalty for blank symbol
decoding_method: str = "greedy_search", # Decoding method (greedy_search or modified_beam_search)
debug: bool = False, # Show debug messages
sample_rate: int = 16000, # Sample rate
feature_dim: int = 80, # Feature dimension
use_itn: bool = True, # Use ITN for SenseVoice models
) -> None:

self.model_type = model_type
self.encoder = encoder
self.decoder = decoder
self.joiner = joiner
self.paraformer = paraformer
self.nemo_ctc = nemo_ctc
self.wenet_ctc = wenet_ctc
self.tdnn_model = tdnn_model
self.whisper_encoder = whisper_encoder
self.whisper_decoder = whisper_decoder
self.sense_voice = sense_voice
self.tokens = tokens
self.hotwords_file = hotwords_file
self.hotwords_score = hotwords_score
self.modeling_unit = modeling_unit
self.bpe_vocab = bpe_vocab
self.num_threads = num_threads
self.whisper_language = whisper_language
self.whisper_task = whisper_task
self.whisper_tail_paddings = whisper_tail_paddings
self.blank_penalty = blank_penalty
self.decoding_method = decoding_method
self.debug = debug
self.SAMPLE_RATE = sample_rate
self.feature_dim = feature_dim
self.use_itn = use_itn

self.asr_with_vad = None

self.recognizer = self._create_recognizer()

def _create_recognizer(self):
if self.model_type == "transducer":
recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
encoder=self.encoder,
decoder=self.decoder,
joiner=self.joiner,
tokens=self.tokens,
num_threads=self.num_threads,
sample_rate=self.SAMPLE_RATE,
feature_dim=self.feature_dim,
decoding_method=self.decoding_method,
hotwords_file=self.hotwords_file,
hotwords_score=self.hotwords_score,
modeling_unit=self.modeling_unit,
bpe_vocab=self.bpe_vocab,
blank_penalty=self.blank_penalty,
debug=self.debug,
)
elif self.model_type == "paraformer":
recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
paraformer=self.paraformer,
tokens=self.tokens,
num_threads=self.num_threads,
sample_rate=self.SAMPLE_RATE,
feature_dim=self.feature_dim,
decoding_method=self.decoding_method,
debug=self.debug,
)
elif self.model_type == "nemo_ctc":
recognizer = sherpa_onnx.OfflineRecognizer.from_nemo_ctc(
model=self.nemo_ctc,
tokens=self.tokens,
num_threads=self.num_threads,
sample_rate=self.SAMPLE_RATE,
feature_dim=self.feature_dim,
decoding_method=self.decoding_method,
debug=self.debug,
)
elif self.model_type == "wenet_ctc":
recognizer = sherpa_onnx.OfflineRecognizer.from_wenet_ctc(
model=self.wenet_ctc,
tokens=self.tokens,
num_threads=self.num_threads,
sample_rate=self.SAMPLE_RATE,
feature_dim=self.feature_dim,
decoding_method=self.decoding_method,
debug=self.debug,
)
elif self.model_type == "whisper":
recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
encoder=self.whisper_encoder,
decoder=self.whisper_decoder,
tokens=self.tokens,
num_threads=self.num_threads,
decoding_method=self.decoding_method,
debug=self.debug,
language=self.whisper_language,
task=self.whisper_task,
tail_paddings=self.whisper_tail_paddings,
)
elif self.model_type == "tdnn_ctc":
recognizer = sherpa_onnx.OfflineRecognizer.from_tdnn_ctc(
model=self.tdnn_model,
tokens=self.tokens,
sample_rate=self.SAMPLE_RATE,
feature_dim=self.feature_dim,
num_threads=self.num_threads,
decoding_method=self.decoding_method,
debug=self.debug,
)
elif self.model_type == "sense_voice":
recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
model=self.sense_voice,
tokens=self.tokens,
num_threads=self.num_threads,
use_itn=self.use_itn,
debug=self.debug
)
else:
raise ValueError(f"Invalid model type: {self.model_type}")

return recognizer

def transcribe_np(self, audio: np.ndarray) -> str:
stream = self.recognizer.create_stream()
stream.accept_waveform(self.SAMPLE_RATE, audio)
self.recognizer.decode_streams([stream])
return stream.result.text
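For reference, a usage sketch of the new `VoiceRecognition` class; the import assumes the project root is on the Python path, and the model paths and the silent NumPy buffer are placeholders.

```python
import numpy as np
from asr.sherpa_onnx_asr import VoiceRecognition

# Placeholder paths for a downloaded SenseVoice model.
asr = VoiceRecognition(
    model_type="sense_voice",
    sense_voice="/path/to/sense-voice/model.onnx",
    tokens="/path/to/sense-voice/tokens.txt",
    num_threads=4,
    use_itn=True,
)

# One second of silence at 16 kHz stands in for real microphone audio.
audio = np.zeros(16000, dtype=np.float32)
print(asr.transcribe_np(audio))
```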
conf.yaml: 59 additions, 1 deletion
@@ -100,7 +100,7 @@ VOICE_INPUT_ON: True
# Put your mic in the browser or in the terminal? (would increase latency)
MIC_IN_BROWSER: False # Deprecated and useless now. Do not enable it. Bad things will happen.

# speech to text model options: "Faster-Whisper", "WhisperCPP", "Whisper", "AzureASR", "FunASR", "GroqWhisperASR"
# speech to text model options: "Faster-Whisper", "WhisperCPP", "Whisper", "AzureASR", "FunASR", "GroqWhisperASR", "SherpaOnnxASR"
ASR_MODEL: "FunASR"

AzureASR:
@@ -143,6 +143,46 @@ FunASR:
use_itn: False
language: "auto" # zh, en, auto

# pip install sherpa-onnx
# documentation: https://k2-fsa.github.io/sherpa/onnx/index.html
# ASR models download: https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
SherpaOnnxASR:
model_type: "sense_voice" # "transducer", "paraformer", "nemo_ctc", "wenet_ctc", "whisper", "tdnn_ctc"
# Choose only ONE of the following, depending on the model_type:
# --- For model_type: "transducer" ---
# encoder: "" # Path to the encoder model (e.g., "path/to/encoder.onnx")
# decoder: "" # Path to the decoder model (e.g., "path/to/decoder.onnx")
# joiner: "" # Path to the joiner model (e.g., "path/to/joiner.onnx")
# --- For model_type: "paraformer" ---
# paraformer: "" # Path to the paraformer model (e.g., "path/to/model.onnx")
# --- For model_type: "nemo_ctc" ---
# nemo_ctc: "" # Path to the NeMo CTC model (e.g., "path/to/model.onnx")
# --- For model_type: "wenet_ctc" ---
# wenet_ctc: "" # Path to the WeNet CTC model (e.g., "path/to/model.onnx")
# --- For model_type: "tdnn_ctc" ---
# tdnn_model: "" # Path to the TDNN CTC model (e.g., "path/to/model.onnx")
# --- For model_type: "whisper" ---
# whisper_encoder: "" # Path to the Whisper encoder model (e.g., "path/to/encoder.onnx")
# whisper_decoder: "" # Path to the Whisper decoder model (e.g., "path/to/decoder.onnx")
# --- For model_type: "sense_voice" ---
sense_voice: "/path/to/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.onnx" # Path to the SenseVoice model (e.g., "path/to/model.onnx")

Review discussion on this line:
- Reviewer: I suggest that you use model.int8.onnx, which is way smaller in file size than model.onnx.
- Contributor (author): Thanks for the heads up! I appreciate the suggestion to use model.int8.onnx. Do you have any insight into how the int8 model compares to the original model.onnx in recognition accuracy? Have you had a chance to evaluate it?
- Reviewer: I only tested it on fewer than 4 test wave files; the quantized model produced results identical to the non-quantized model.

tokens: "/path/to/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt" # Path to tokens.txt (required for all model types)
# --- Optional parameters (with defaults shown) ---
# hotwords_file: "" # Path to hotwords file (if using hotwords)
# hotwords_score: 1.5 # Score for hotwords
# modeling_unit: "" # Modeling unit for hotwords (if applicable)
# bpe_vocab: "" # Path to BPE vocabulary (if applicable)
num_threads: 4 # Number of threads
# whisper_language: "" # Language for Whisper models (e.g., "en", "zh", etc. - if using Whisper)
# whisper_task: "transcribe" # Task for Whisper models ("transcribe" or "translate" - if using Whisper)
# whisper_tail_paddings: -1 # Tail padding for Whisper models (if using Whisper)
# blank_penalty: 0.0 # Penalty for blank symbol
# decoding_method: "greedy_search" # "greedy_search" or "modified_beam_search"
# debug: False # Enable debug mode
# sample_rate: 16000 # Sample rate (should match the model's expected sample rate)
# feature_dim: 80 # Feature dimension (should match the model's expected feature dimension)
use_itn: True # Enable ITN for SenseVoice models (set to False if not using a SenseVoice model)

GroqWhisperASR:
api_key: ""
model: "whisper-large-v3-turbo" # or "whisper-large-v3"
@@ -247,6 +287,24 @@ coquiTTS:
# Device to run model on ("cuda", "cpu", or leave empty for auto-detect)
device: ""

# pip install sherpa-onnx
# documentation: https://k2-fsa.github.io/sherpa/onnx/index.html
# TTS models download: https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
# see config_alts for more examples
SherpaOnnxTTS:
vits_model: "/path/to/tts-models/vits-melo-tts-zh_en/model.onnx" # Path to VITS model file
vits_lexicon: "/path/to/tts-models/vits-melo-tts-zh_en/lexicon.txt" # Path to lexicon file (optional)
vits_tokens: "/path/to/tts-models/vits-melo-tts-zh_en/tokens.txt" # Path to tokens file
vits_data_dir: "" # "/path/to/tts-models/vits-piper-en_GB-cori-high/espeak-ng-data" # Path to espeak-ng data (optional)
vits_dict_dir: "/path/to/tts-models/vits-melo-tts-zh_en/dict" # Path to Jieba dict (optional, for Chinese)
tts_rule_fsts: "/path/to/tts-models/vits-melo-tts-zh_en/number.fst,/path/to/tts-models/vits-melo-tts-zh_en/phone.fst,/path/to/tts-models/vits-melo-tts-zh_en/date.fst,/path/to/tts-models/vits-melo-tts-zh_en/new_heteronym.fst" # Path to rule FSTs file (optional)
max_num_sentences: 2 # Max sentences per batch (or -1 for all)
sid: 1 # Speaker ID (for multi-speaker models)
provider: "cpu" # Use "cpu", "cuda" (GPU), or "coreml" (Apple)
num_threads: 1 # Number of computation threads
speed: 1.0 # Speech speed (1.0 is normal)
debug: false # Enable debug mode (True/False)

# ============== Translate (to only change the language for TTS) ==============
# Like... you speak and read the subtitles in English, and the TTS speaks Japanese, that kind of thing
