diff --git a/setup.py b/setup.py
index 19754eaf3f2..8406adc3072 100644
--- a/setup.py
+++ b/setup.py
@@ -132,7 +132,6 @@
     "librosa",
     "nltk",
     "natten>=0.14.6",
-    "numba<0.57.0",  # Can be removed once unpinned.
     "numpy>=1.17",
     "onnxconverter-common",
     "onnxruntime-tools>=1.4.2",
@@ -286,8 +285,7 @@ def run(self):
 extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
 
-# numba can be removed here once unpinned
-extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm", "numba")
+extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm")
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 4819b959cb8..869e96d7fbf 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -37,7 +37,6 @@
     "librosa": "librosa",
     "nltk": "nltk",
     "natten": "natten>=0.14.6",
-    "numba": "numba<0.57.0",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
     "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index 7823c4cf9e6..9caf2b9f722 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -156,7 +156,15 @@ def as_tensor(value):
             as_tensor = jnp.array
             is_tensor = is_jax_tensor
         else:
-            as_tensor = np.asarray
+
+            def as_tensor(value, dtype=None):
+                if isinstance(value, (list, tuple)) and isinstance(value[0], (list, tuple, np.ndarray)):
+                    value_lens = [len(val) for val in value]
+                    if len(set(value_lens)) > 1 and dtype is None:
+                        # we have a ragged list so handle explicitly
+                        value = as_tensor([np.asarray(val) for val in value], dtype=object)
+                return np.asarray(value, dtype=dtype)
+
             is_tensor = is_numpy_array
 
         # Do the tensor conversion in batch
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index ecc9d5011d7..4b65b56f7a9 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -705,7 +705,15 @@ def convert_to_tensors(
             as_tensor = jnp.array
             is_tensor = is_jax_tensor
         else:
-            as_tensor = np.asarray
+
+            def as_tensor(value, dtype=None):
+                if isinstance(value, (list, tuple)) and isinstance(value[0], (list, tuple, np.ndarray)):
+                    value_lens = [len(val) for val in value]
+                    if len(set(value_lens)) > 1 and dtype is None:
+                        # we have a ragged list so handle explicitly
+                        value = as_tensor([np.asarray(val) for val in value], dtype=object)
+                return np.asarray(value, dtype=dtype)
+
             is_tensor = is_numpy_array
 
         # Do the tensor conversion in batch
diff --git a/tests/models/realm/test_modeling_realm.py b/tests/models/realm/test_modeling_realm.py
index 228e0344b8b..ddd6c264501 100644
--- a/tests/models/realm/test_modeling_realm.py
+++ b/tests/models/realm/test_modeling_realm.py
@@ -392,7 +392,7 @@ def test_training(self):
                 b"This is the fourth record.",
                 b"This is the fifth record.",
             ],
-            dtype=np.object,
+            dtype=object,
         )
         retriever = RealmRetriever(block_records, tokenizer)
         model = RealmForOpenQA(openqa_config, retriever)
diff --git a/tests/models/realm/test_retrieval_realm.py b/tests/models/realm/test_retrieval_realm.py
index 939d9844004..ba65a6afdd6 100644
--- a/tests/models/realm/test_retrieval_realm.py
+++ b/tests/models/realm/test_retrieval_realm.py
@@ -100,7 +100,7 @@ def get_dummy_block_records(self):
                 b"This is the fifth record",
                 b"This is a longer longer longer record",
             ],
-            dtype=np.object,
+            dtype=object,
         )
         return block_records
 
@@ -116,7 +116,7 @@ def test_retrieve(self):
         retriever = self.get_dummy_retriever()
         tokenizer = retriever.tokenizer
 
-        retrieved_block_ids = np.array([0, 3], dtype=np.long)
+        retrieved_block_ids = np.array([0, 3], dtype="long")
         question_input_ids = tokenizer(["Test question"]).input_ids
         answer_ids = tokenizer(
             ["the fourth"],
@@ -151,7 +151,7 @@ def test_block_has_answer(self):
         retriever = self.get_dummy_retriever()
         tokenizer = retriever.tokenizer
 
-        retrieved_block_ids = np.array([0, 3, 5], dtype=np.long)
+        retrieved_block_ids = np.array([0, 3, 5], dtype="long")
         question_input_ids = tokenizer(["Test question"]).input_ids
         answer_ids = tokenizer(
             ["the fourth", "longer longer"],