diff --git a/.gitignore b/.gitignore index 8f634c28..d1ccce73 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.pyc *.pyo *.pyd +*.egg-info/ __pycache__/ # Distribution / Packaging diff --git a/README.md b/README.md index 36320faf..2e6c07c4 100644 --- a/README.md +++ b/README.md @@ -195,11 +195,12 @@ For example the command below converts the [original "large-v2" Whisper model](h pip install transformers[torch]>=4.23 ct2-transformers-converter --model openai/whisper-large-v2 --output_dir whisper-large-v2-ct2 \ - --copy_files tokenizer.json --quantization float16 + --copy_files tokenizer.json preprocessor_config.json --quantization float16 ``` * The option `--model` accepts a model name on the Hub or a path to a model directory. * If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later. +* If the option `--copy_files preprocessor_config.json` is not used, the melbank feature extractor is instantiated with default parameters. Note that the `large-v3` version of Whisper uses different melbank settings compared to the previous versions. Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html). diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 1af70b93..c3b13b43 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -108,7 +108,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: def split_to_word_tokens( self, tokens: List[int] ) -> Tuple[List[str], List[List[int]]]: - if self.language_code in {"zh", "ja", "th", "lo", "my"}: + if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: # These languages don't typically use spaces, so it is difficult to split words # without morpheme analysis. 
Here, we instead split words at any # position where the tokens are decoded as valid unicode points @@ -274,4 +274,5 @@ def split_tokens_on_spaces( "yi", "yo", "zh", + "yue", ) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 86187fca..bafa665d 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -1,4 +1,5 @@ import itertools +import json import logging import os import zlib @@ -142,7 +143,25 @@ def __init__( "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en") ) - self.feature_extractor = FeatureExtractor() + feature_extractor_file = os.path.join(model_path, "preprocessor_config.json") + if os.path.isfile(feature_extractor_file): + with open(feature_extractor_file, "r") as f: + config = json.load(f) + feat_kwargs = { + k: config[k] + for k in [ + "n_fft", + "hop_length", + "feature_size", + "sampling_rate", + "chunk_length", + ] + if k in config + } + else: + feat_kwargs = {} + + self.feature_extractor = FeatureExtractor(**feat_kwargs) self.num_samples_per_token = self.feature_extractor.hop_length * 2 self.frames_per_second = ( self.feature_extractor.sampling_rate // self.feature_extractor.hop_length diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index f020bc27..8bff2c26 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -21,6 +21,7 @@ "large-v1": "guillaumekln/faster-whisper-large-v1", "large-v2": "guillaumekln/faster-whisper-large-v2", "large": "guillaumekln/faster-whisper-large-v2", + "large-v3": "flyingleafe/faster-whisper-large-v3", }