huggingface · sanchit-gandhi · May 31, 2023 · May 5, 2023 · May 5, 2023 · May 5, 2023
@@ -132,7 +132,6 @@
     "librosa",
     "nltk",
     "natten>=0.14.6",
-    "numba<0.57.0",  # Can be removed once unpinned.
     "numpy>=1.17",
     "onnxconverter-common",
     "onnxruntime-tools>=1.4.2",
@@ -286,8 +285,7 @@ def run(self):
 extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"]
 
 extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
-# numba can be removed here once unpinned
-extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm", "numba")
+extras["audio"] = deps_list("librosa", "pyctcdecode", "phonemizer", "kenlm")
 # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead
 extras["speech"] = deps_list("torchaudio") + extras["audio"]
 extras["torch-speech"] = deps_list("torchaudio") + extras["audio"]

@@ -37,7 +37,6 @@
     "librosa": "librosa",
     "nltk": "nltk",
     "natten": "natten>=0.14.6",
-    "numba": "numba<0.57.0",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
     "onnxruntime-tools": "onnxruntime-tools>=1.4.2",

@@ -156,7 +156,22 @@ def as_tensor(value):
             as_tensor = jnp.array
             is_tensor = is_jax_tensor
         else:
-            as_tensor = np.asarray
+
+            def as_tensor(value, dtype=None):
+                """
+                Convert `value` to a NumPy array, building ragged batches as 1-D object arrays.
+
+                NumPy >= 1.24 raises a `ValueError` when asked to build an array from sequences of unequal
+                length unless `dtype=object` is requested, so such batches are converted row by row.
+                """
+                # `value and` guards the `value[0]` lookup: an empty list must still reach `np.asarray`
+                if isinstance(value, (list, tuple)) and value and isinstance(value[0], (list, tuple, np.ndarray)):
+                    value_lens = [len(val) for val in value]
+                    if len(set(value_lens)) > 1 and dtype is None:
+                        # we have a ragged list so handle explicitly
+                        value = as_tensor([np.asarray(val) for val in value], dtype=object)
+                return np.asarray(value, dtype=dtype)
+
             is_tensor = is_numpy_array
 
         # Do the tensor conversion in batch

@@ -705,7 +705,22 @@ def convert_to_tensors(
             as_tensor = jnp.array
             is_tensor = is_jax_tensor
         else:
-            as_tensor = np.asarray
+
+            def as_tensor(value, dtype=None):
+                """
+                Convert `value` to a NumPy array, building ragged batches as 1-D object arrays.
+
+                NumPy >= 1.24 raises a `ValueError` when asked to build an array from sequences of unequal
+                length unless `dtype=object` is requested, so such batches are converted row by row.
+                """
+                # `value and` guards the `value[0]` lookup: an empty list must still reach `np.asarray`
+                if isinstance(value, (list, tuple)) and value and isinstance(value[0], (list, tuple, np.ndarray)):
+                    value_lens = [len(val) for val in value]
+                    if len(set(value_lens)) > 1 and dtype is None:
+                        # we have a ragged list so handle explicitly
+                        value = as_tensor([np.asarray(val) for val in value], dtype=object)
+                return np.asarray(value, dtype=dtype)
+
             is_tensor = is_numpy_array
 
         # Do the tensor conversion in batch

@@ -392,7 +392,7 @@ def test_training(self):
                 b"This is the fourth record.",
                 b"This is the fifth record.",
             ],
-            dtype=np.object,
+            dtype=object,
         )
         retriever = RealmRetriever(block_records, tokenizer)
         model = RealmForOpenQA(openqa_config, retriever)

@@ -100,7 +100,7 @@ def get_dummy_block_records(self):
                 b"This is the fifth record",
                 b"This is a longer longer longer record",
             ],
-            dtype=np.object,
+            dtype=object,
         )
         return block_records
 
@@ -116,7 +116,7 @@ def test_retrieve(self):
         retriever = self.get_dummy_retriever()
         tokenizer = retriever.tokenizer
 
-        retrieved_block_ids = np.array([0, 3], dtype=np.long)
+        retrieved_block_ids = np.array([0, 3], dtype="long")
         question_input_ids = tokenizer(["Test question"]).input_ids
         answer_ids = tokenizer(
             ["the fourth"],
@@ -151,7 +151,7 @@ def test_block_has_answer(self):
         retriever = self.get_dummy_retriever()
         tokenizer = retriever.tokenizer
 
-        retrieved_block_ids = np.array([0, 3, 5], dtype=np.long)
+        retrieved_block_ids = np.array([0, 3, 5], dtype="long")
         question_input_ids = tokenizer(["Test question"]).input_ids
         answer_ids = tokenizer(
             ["the fourth", "longer longer"],