Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

removes decord #33987

Merged
merged 3 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/transformers-all-latest-gpu/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum

# For video model testing
RUN python3 -m pip install --no-cache-dir decord av==9.2.0
RUN python3 -m pip install --no-cache-dir av==9.2.0

# Some slow tests require bnb
RUN python3 -m pip install --no-cache-dir bitsandbytes
Expand Down
3 changes: 1 addition & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@
"cookiecutter==1.7.3",
"dataclasses",
"datasets!=2.5.0",
"decord==0.6.0",
"deepspeed>=0.9.3",
"diffusers",
"dill<0.3.5",
Expand Down Expand Up @@ -313,7 +312,7 @@ def run(self):
extras["torch-vision"] = deps_list("torchvision") + extras["vision"]
extras["natten"] = deps_list("natten")
extras["codecarbon"] = deps_list("codecarbon")
extras["video"] = deps_list("decord", "av")
extras["video"] = deps_list("av")

extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
extras["tiktoken"] = deps_list("tiktoken", "blobfile")
Expand Down
2 changes: 0 additions & 2 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,6 @@
"is_av_available",
"is_bitsandbytes_available",
"is_datasets_available",
"is_decord_available",
"is_faiss_available",
"is_flax_available",
"is_keras_nlp_available",
Expand Down Expand Up @@ -5833,7 +5832,6 @@
is_av_available,
is_bitsandbytes_available,
is_datasets_available,
is_decord_available,
is_faiss_available,
is_flax_available,
is_keras_nlp_available,
Expand Down
1 change: 0 additions & 1 deletion src/transformers/dependency_versions_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
"cookiecutter": "cookiecutter==1.7.3",
"dataclasses": "dataclasses",
"datasets": "datasets!=2.5.0",
"decord": "decord==0.6.0",
"deepspeed": "deepspeed>=0.9.3",
"diffusers": "diffusers",
"dill": "dill<0.3.5",
Expand Down
43 changes: 32 additions & 11 deletions src/transformers/models/git/convert_git_to_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import argparse
from pathlib import Path

import av
import numpy as np
import requests
import torch
Expand Down Expand Up @@ -193,10 +194,27 @@ def prepare_img(model_name):


def prepare_video():
from decord import VideoReader, cpu
def read_video_pyav(container, indices):
"""
Decode the video with PyAV decoder.

# set seed for reproducibility
np.random.seed(0)
Args:
container (`av.container.input.InputContainer`): PyAV container.
indices (`List[int]`): List of frame indices to decode.

Returns:
result (`np.ndarray`): NumPy array of decoded frames of shape (num_frames, height, width, 3).
"""
frames = []
container.seek(0)
start_index = indices[0]
end_index = indices[-1]
for i, frame in enumerate(container.decode(video=0)):
if i > end_index:
break
if i >= start_index and i in indices:
frames.append(frame)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
frames.append(frame)
frames.append(frame.to_rgb())

any reason we don't use this?
or directly convert before stacking?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question — we apply the conversion once at the end, transforming the frames directly into a NumPy array as they are stacked:

return np.stack([x.to_ndarray(format="rgb24") for x in frames])

It's more efficient than converting the frames to rgb:
https://github.com/PyAV-Org/PyAV/blob/main/av/video/frame.pyx#L252
https://github.com/PyAV-Org/PyAV/blob/main/av/audio/frame.pyx#L168

return np.stack([x.to_ndarray(format="rgb24") for x in frames])

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
"""
Expand All @@ -217,16 +235,19 @@ def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
return indices

# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset")
videoreader = VideoReader(file_path, num_threads=1, ctx=cpu(0))
# set seed for reproducibility
np.random.seed(0)

# sample 6 frames
videoreader.seek(0)
indices = sample_frame_indices(clip_len=6, frame_sample_rate=4, seg_len=len(videoreader))
video = videoreader.get_batch(indices).asnumpy()
file_path = hf_hub_download(repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset")
with av.open(file_path) as container:
# sample 6 frames
num_frames = 6
indices = sample_frame_indices(
clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
)
frames = read_video_pyav(container, indices)

return video
return frames


@torch.no_grad()
Expand Down
8 changes: 0 additions & 8 deletions src/transformers/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@
is_compressed_tensors_available,
is_cv2_available,
is_cython_available,
is_decord_available,
is_detectron2_available,
is_eetq_available,
is_essentia_available,
Expand Down Expand Up @@ -758,13 +757,6 @@ def require_spacy(test_case):
return unittest.skipUnless(is_spacy_available(), "test requires spacy")(test_case)


def require_decord(test_case):
"""
Decorator marking a test that requires decord. These tests are skipped when decord isn't installed.
"""
return unittest.skipUnless(is_decord_available(), "test requires decord")(test_case)


def require_torch_multi_gpu(test_case):
"""
Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without
Expand Down
1 change: 0 additions & 1 deletion src/transformers/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@
is_cv2_available,
is_cython_available,
is_datasets_available,
is_decord_available,
is_detectron2_available,
is_eetq_available,
is_essentia_available,
Expand Down
10 changes: 0 additions & 10 deletions src/transformers/utils/import_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
# `importlib.metadata.util` doesn't work with `opencv-python-headless`.
_cv2_available = importlib.util.find_spec("cv2") is not None
_datasets_available = _is_package_available("datasets")
_decord_available = importlib.util.find_spec("decord") is not None
_detectron2_available = _is_package_available("detectron2")
# We need to check both `faiss` and `faiss-cpu`.
_faiss_available = importlib.util.find_spec("faiss") is not None
Expand Down Expand Up @@ -1173,10 +1172,6 @@ def is_ccl_available():
return _is_ccl_available


def is_decord_available():
return _decord_available


def is_sudachi_available():
return _sudachipy_available

Expand Down Expand Up @@ -1547,10 +1542,6 @@ def is_liger_kernel_available():
Please note that you may need to restart your runtime after installation.
"""

DECORD_IMPORT_ERROR = """
{0} requires the decord library but it was not found in your environment. You can install it with pip: `pip install
decord`. Please note that you may need to restart your runtime after installation.
"""

CYTHON_IMPORT_ERROR = """
{0} requires the Cython library but it was not found in your environment. You can install it with pip: `pip install
Expand Down Expand Up @@ -1612,7 +1603,6 @@ def is_liger_kernel_available():
("scipy", (is_scipy_available, SCIPY_IMPORT_ERROR)),
("accelerate", (is_accelerate_available, ACCELERATE_IMPORT_ERROR)),
("oneccl_bind_pt", (is_ccl_available, CCL_IMPORT_ERROR)),
("decord", (is_decord_available, DECORD_IMPORT_ERROR)),
("cython", (is_cython_available, CYTHON_IMPORT_ERROR)),
("jieba", (is_jieba_available, JIEBA_IMPORT_ERROR)),
("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
Expand Down
6 changes: 3 additions & 3 deletions tests/test_pipeline_mixin.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from transformers.pipelines import AudioClassificationPipeline, AutomaticSpeechRecognitionPipeline
from transformers.testing_utils import (
is_pipeline_test,
require_decord,
require_av,
require_pytesseract,
require_timm,
require_torch,
Expand Down Expand Up @@ -583,14 +583,14 @@ def test_pipeline_translation_fp16(self):
@is_pipeline_test
@require_torch_or_tf
@require_vision
@require_decord
@require_av
def test_pipeline_video_classification(self):
self.run_task_tests(task="video-classification")

@is_pipeline_test
@require_vision
@require_decord
@require_torch
@require_av
def test_pipeline_video_classification_fp16(self):
self.run_task_tests(task="video-classification", torch_dtype="float16")

Expand Down
Loading