diff --git a/.gitignore b/.gitignore index 8f634c28..e8f318db 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,24 @@ venv/ # Ignore IDE, Editor Files .idea/ .vscode/ + + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST diff --git a/README.md b/README.md index 36320faf..e1b23b50 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ pip install --force-reinstall "faster-whisper @ https://github.com/guillaumekln/ ```python from faster_whisper import WhisperModel -model_size = "large-v2" +model_size = "large-v3" # Run on GPU with FP16 model = WhisperModel(model_size, device="cuda", compute_type="float16") diff --git a/faster_whisper/tokenizer.py b/faster_whisper/tokenizer.py index 1af70b93..c3b13b43 100644 --- a/faster_whisper/tokenizer.py +++ b/faster_whisper/tokenizer.py @@ -108,7 +108,7 @@ def decode_with_timestamps(self, tokens: List[int]) -> str: def split_to_word_tokens( self, tokens: List[int] ) -> Tuple[List[str], List[List[int]]]: - if self.language_code in {"zh", "ja", "th", "lo", "my"}: + if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}: # These languages don't typically use spaces, so it is difficult to split words # without morpheme analysis. Here, we instead split words at any # position where the tokens are decoded as valid unicode points @@ -274,4 +274,5 @@ def split_tokens_on_spaces( "yi", "yo", "zh", + "yue", ) diff --git a/faster_whisper/transcribe.py b/faster_whisper/transcribe.py index 86187fca..15a8d1ba 100644 --- a/faster_whisper/transcribe.py +++ b/faster_whisper/transcribe.py @@ -1,4 +1,5 @@ import itertools +import json import logging import os import zlib @@ -92,8 +93,8 @@ def __init__( Args: model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en, - small, small.en, medium, medium.en, large-v1, large-v2, or large), a path to a converted - model directory, or a CTranslate2-converted Whisper model ID from the Hugging Face Hub. + small, small.en, medium, medium.en, large-v1, large-v2, large-v3, or large), a path to a + converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub. When a size or a model ID is configured, the converted model is downloaded from the Hugging Face Hub. device: Device to use for computation ("cpu", "cuda", "auto"). @@ -113,6 +114,9 @@ def __init__( are saved in the standard Hugging Face cache directory. local_files_only: If True, avoid downloading the file and return the path to the local cached file if it exists. + feature_size: Number of mel filters to use for feature extraction. If not set, + the number of mel filters is inferred from the model version. The first release + used 80 bins, but the large-v3 model uses 128 bins. """ self.logger = get_logger() @@ -142,7 +146,25 @@ def __init__( "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en") ) - self.feature_extractor = FeatureExtractor() + feature_extractor_file = os.path.join(model_path, "preprocessor_config.json") + if os.path.isfile(feature_extractor_file): + with open(feature_extractor_file, "r") as f: + config = json.load(f) + feat_kwargs = { + k: config[k] + for k in [ + "n_fft", + "hop_length", + "feature_size", + "sampling_rate", + "chunk_length", + ] + if k in config + } + else: + feat_kwargs = {} + + self.feature_extractor = FeatureExtractor(**feat_kwargs) self.num_samples_per_token = self.feature_extractor.hop_length * 2 self.frames_per_second = ( self.feature_extractor.sampling_rate // self.feature_extractor.hop_length diff --git a/faster_whisper/utils.py b/faster_whisper/utils.py index f020bc27..8176b905 100644 --- a/faster_whisper/utils.py +++ b/faster_whisper/utils.py @@ -21,6 +21,7 @@ "large-v1": "guillaumekln/faster-whisper-large-v1", "large-v2": "guillaumekln/faster-whisper-large-v2", "large": "guillaumekln/faster-whisper-large-v2", + "large-v3": "bababababooey/faster-whisper-large-v3", } @@ -50,7 +51,7 @@ def download_model( Args: size_or_id: Size of the model to download from https://huggingface.co/guillaumekln (tiny, tiny.en, base, base.en, small, small.en medium, medium.en, large-v1, large-v2, - large), or a CTranslate2-converted model ID from the Hugging Face Hub + large, large-v3), or a CTranslate2-converted model ID from the Hugging Face Hub (e.g. guillaumekln/faster-whisper-large-v2). output_dir: Directory where the model should be saved. If not set, the model is saved in the cache directory. @@ -76,6 +77,7 @@ def download_model( allow_patterns = [ "config.json", + "preprocessor_config.json", "model.bin", "tokenizer.json", "vocabulary.*", diff --git a/faster_whisper/version.py b/faster_whisper/version.py index ca25270f..e1f6d311 100644 --- a/faster_whisper/version.py +++ b/faster_whisper/version.py @@ -1,3 +1,3 @@ """Version information.""" -__version__ = "0.9.0" +__version__ = "0.10.0" diff --git a/requirements.txt b/requirements.txt index fa037f71..d1bb1cfb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ av==10.* -ctranslate2>=3.17,<4 +ctranslate2>=3.21,<4 huggingface_hub>=0.13 tokenizers>=0.13,<0.15 onnxruntime>=1.14,<2