
Whisper large-v3 #549

Closed · wants to merge 3 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -2,6 +2,7 @@
 *.pyc
 *.pyo
 *.pyd
+*.egg-info/
 __pycache__/

 # Distribution / Packaging
3 changes: 2 additions & 1 deletion README.md
@@ -195,11 +195,12 @@ For example the command below converts the [original "large-v2" Whisper model](h
 pip install transformers[torch]>=4.23

 ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \
-    --copy_files tokenizer.json --quantization float16
+    --copy_files tokenizer.json preprocessor_config.json --quantization float16
 ```

 * The option `--model` accepts a model name on the Hub or a path to a model directory.
 * If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
+* If the option `--copy_files preprocessor_config.json` is not used, the melbank feature extractor is instantiated with default parameters. Note that the `large-v3` version of Whisper uses different melbank settings than previous versions.
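The melbank difference the new bullet refers to is that large-v3 computes a 128-bin mel spectrogram instead of the 80 bins used by earlier checkpoints. A `preprocessor_config.json` for large-v3 would therefore carry roughly the values sketched below; treat the exact numbers as assumptions and consult the file shipped with the converted checkpoint for the authoritative ones:

```json
{
  "feature_size": 128,
  "n_fft": 400,
  "hop_length": 160,
  "sampling_rate": 16000,
  "chunk_length": 30
}
```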

Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).

3 changes: 2 additions & 1 deletion faster_whisper/tokenizer.py
@@ -108,7 +108,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str:
     def split_to_word_tokens(
         self, tokens: List[int]
     ) -> Tuple[List[str], List[List[int]]]:
-        if self.language_code in {"zh", "ja", "th", "lo", "my"}:
+        if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
             # These languages don't typically use spaces, so it is difficult to split words
             # without morpheme analysis. Here, we instead split words at any
             # position where the tokens are decoded as valid unicode points
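The strategy that comment describes can be sketched in isolation: greedily grow a group of tokens until the group decodes without U+FFFD replacement characters, i.e. until it forms complete unicode code points, then emit the group as one "word". This is an illustrative standalone sketch, not the library's implementation; `decode` stands in for any tokens-to-text function.

```python
REPLACEMENT_CHAR = "\ufffd"


def split_at_valid_unicode(tokens, decode):
    # Accumulate tokens until the accumulated group decodes cleanly
    # (no U+FFFD replacement characters), then flush it as one word.
    words, word_tokens, current = [], [], []
    for token in tokens:
        current.append(token)
        decoded = decode(current)
        if REPLACEMENT_CHAR not in decoded:
            words.append(decoded)
            word_tokens.append(current)
            current = []
    return words, word_tokens
```

With a toy byte-level `decode`, the six UTF-8 bytes of "你好" come back out as the two characters, each paired with its three source bytes.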
@@ -274,4 +274,5 @@ def split_tokens_on_spaces(
     "yi",
     "yo",
     "zh",
+    "yue",
 )
21 changes: 20 additions & 1 deletion faster_whisper/transcribe.py
@@ -1,4 +1,5 @@
 import itertools
+import json
 import logging
 import os
 import zlib
@@ -142,7 +143,25 @@ def __init__(
             "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
         )

-        self.feature_extractor = FeatureExtractor()
+        feature_extractor_file = os.path.join(model_path, "preprocessor_config.json")
+        if os.path.isfile(feature_extractor_file):
+            with open(feature_extractor_file, "r") as f:
+                config = json.load(f)
+            feat_kwargs = {
+                k: config[k]
+                for k in [
+                    "n_fft",
+                    "hop_length",
+                    "feature_size",
+                    "sampling_rate",
+                    "chunk_length",
+                ]
+                if k in config
+            }
+        else:
+            feat_kwargs = {}
+
+        self.feature_extractor = FeatureExtractor(**feat_kwargs)
         self.num_samples_per_token = self.feature_extractor.hop_length * 2
         self.frames_per_second = (
             self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
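The kwargs-filtering step in the hunk above can be exercised on its own. The helper below is a hypothetical standalone version of that logic (the function name is mine, not the library's): it reads `preprocessor_config.json` from a model directory, keeps only the keys the feature extractor understands, and falls back to an empty dict so the extractor's defaults apply when the file is absent.

```python
import json
import os

# Keys the feature extractor accepts; extra keys in the JSON are ignored.
_FEATURE_KEYS = ["n_fft", "hop_length", "feature_size", "sampling_rate", "chunk_length"]


def load_feature_kwargs(model_path):
    path = os.path.join(model_path, "preprocessor_config.json")
    if not os.path.isfile(path):
        # No config shipped with the model: use the extractor's defaults.
        return {}
    with open(path, "r") as f:
        config = json.load(f)
    return {k: config[k] for k in _FEATURE_KEYS if k in config}
```

Filtering on a whitelist keeps the loader forward-compatible: a config written for a newer `transformers` release can carry keys like `processor_class` without breaking `FeatureExtractor(**feat_kwargs)`.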
1 change: 1 addition & 0 deletions faster_whisper/utils.py
@@ -21,6 +21,7 @@
     "large-v1": "guillaumekln/faster-whisper-large-v1",
     "large-v2": "guillaumekln/faster-whisper-large-v2",
     "large": "guillaumekln/faster-whisper-large-v2",
+    "large-v3": "flyingleafe/faster-whisper-large-v3",
Review comment: Did you convert this after the PR fixing alignment heads was included in CTranslate2? And if not, what is the expected impact?

 }
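The mapping above is what lets a bare model size resolve to a converted checkpoint on the Hub. A minimal sketch of that lookup, under the assumption that unknown strings are passed through as repo ids or local paths (the function name `resolve_model` is hypothetical; the real lookup lives in the library's download helper):

```python
_MODELS = {
    "large-v1": "guillaumekln/faster-whisper-large-v1",
    "large-v2": "guillaumekln/faster-whisper-large-v2",
    "large": "guillaumekln/faster-whisper-large-v2",
    "large-v3": "flyingleafe/faster-whisper-large-v3",
}


def resolve_model(size_or_id: str) -> str:
    # A known size maps to its converted Hub repo; anything else is
    # assumed to already be a repo id or a local model directory.
    return _MODELS.get(size_or_id, size_or_id)
```

Note that `"large"` deliberately still points at the v2 conversion, so existing callers are unaffected until they opt in to `"large-v3"`.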

