[Feature] Add HVU datatools (open-mmlab#227)
* init commit

* fix download annotations

* parse hvu tags

* add downloading & trimming scripts

* add file list generation

* finish hvu datatools

* resolve comments

* init commit

* fix download annotations

* parse hvu tags

* add downloading & trimming scripts

* add file list generation

* finish hvu datatools

* resolve comments

* update changelog

* Update data_preparation.md

Co-authored-by: Jintao Lin <[email protected]>
kennymckormick and dreamerlin authored Sep 30, 2020
1 parent c3330d3 commit ec6be3b
Showing 13 changed files with 552 additions and 9 deletions.
1 change: 1 addition & 0 deletions docs/changelog.md
@@ -5,6 +5,7 @@
**Highlights**

**New Features**
- Support the data pre-processing pipeline for the HVU Dataset ([#227](https://github.com/open-mmlab/mmaction2/pull/227/))
- Support running real-time action recognition from a web camera ([#171](https://github.com/open-mmlab/mmaction2/pull/171))
- Support exporting PyTorch models to ONNX ([#160](https://github.com/open-mmlab/mmaction2/pull/160))
- Support UCF101-24 preparation ([#219](https://github.com/open-mmlab/mmaction2/pull/219))
3 changes: 2 additions & 1 deletion docs/data_preparation.md
@@ -22,9 +22,10 @@ To ease usage, we provide tutorials of data deployment for each dataset.
- [Something-Something V2](https://20bn.com/datasets/something-something): See [preparing_sthv2.md](/tools/data/sthv2/preparing_sthv2.md)
- [Moments in Time](http://moments.csail.mit.edu/): See [preparing_mit.md](/tools/data/mit/preparing_mit.md)
- [Multi-Moments in Time](http://moments.csail.mit.edu/challenge_iccv_2019.html): See [preparing_mmit.md](/tools/data/mmit/preparing_mmit.md)
- ActivityNet_feature: See [praparing_activitynet.md](/tools/data/activitynet/preparing_activitynet.md)
- [ActivityNet](http://activity-net.org/): See [preparing_activitynet.md](/tools/data/activitynet/preparing_activitynet.md)
- [UCF101-24](http://www.thumos.info/download.html): See [preparing_ucf101_24.md](/tools/data/ucf101_24/preparing_ucf101_24.md)
- [JHMDB](http://jhmdb.is.tue.mpg.de/): See [preparing_jhmdb.md](/tools/data/jhmdb/preparing_jhmdb.md)
- [HVU](https://github.com/holistic-video-understanding/HVU-Dataset): See [preparing_hvu.md](/tools/data/hvu/preparing_hvu.md)

Now, you can switch to [getting_started.md](getting_started.md) to train and test the model.

200 changes: 200 additions & 0 deletions tools/data/hvu/download.py
@@ -0,0 +1,200 @@
# ------------------------------------------------------------------------------
# Adapted from https://github.com/activitynet/ActivityNet/
# Original licence: Copyright (c) Microsoft, under the MIT License.
# ------------------------------------------------------------------------------

import argparse
import glob
import os
import shutil
import ssl
import subprocess
import uuid

import mmcv
from joblib import Parallel, delayed

ssl._create_default_https_context = ssl._create_unverified_context
args = None


def create_video_folders(dataset, output_dir, tmp_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)


def construct_video_filename(item, trim_format, output_dir):
    """Given a dataset row, this function constructs the output filename for a
    given video."""
    youtube_id, start_time, end_time = item
    start_time, end_time = int(start_time * 10), int(end_time * 10)
    basename = '%s_%s_%s.mp4' % (youtube_id, trim_format % start_time,
                                 trim_format % end_time)
    output_filename = os.path.join(output_dir, basename)
    return output_filename


def download_clip(video_identifier,
                  output_filename,
                  start_time,
                  end_time,
                  tmp_dir='/tmp/hvu',
                  num_attempts=5,
                  url_base='https://www.youtube.com/watch?v='):
    """Download a video from YouTube if it exists and is not blocked.

    arguments:
    ---------
    video_identifier: str
        Unique YouTube video identifier (11 characters).
    output_filename: str
        File path where the video will be stored.
    start_time: float
        Indicates the beginning time in seconds from where the video
        will be trimmed.
    end_time: float
        Indicates the ending time in seconds of the trimmed video.
    """
    # Defensive argument checking.
    assert isinstance(video_identifier, str), 'video_identifier must be string'
    assert isinstance(output_filename, str), 'output_filename must be string'
    assert len(video_identifier) == 11, 'video_identifier must have length 11'

    status = False
    tmp_filename = os.path.join(tmp_dir, '%s.%%(ext)s' % uuid.uuid4())

    if not os.path.exists(output_filename):
        if not os.path.exists(tmp_filename):
            command = [
                'youtube-dl', '--quiet', '--no-warnings',
                '--no-check-certificate', '-f', 'mp4', '-o',
                '"%s"' % tmp_filename,
                '"%s"' % (url_base + video_identifier)
            ]
            command = ' '.join(command)
            print(command)
            attempts = 0
            while True:
                try:
                    subprocess.check_output(
                        command, shell=True, stderr=subprocess.STDOUT)
                except subprocess.CalledProcessError:
                    attempts += 1
                    if attempts == num_attempts:
                        return status, 'Downloading Failed'
                else:
                    break

        tmp_filename = glob.glob('%s*' % tmp_filename.split('.')[0])[0]
        # Construct command to trim the videos (ffmpeg required).
        command = [
            'ffmpeg', '-i',
            '"%s"' % tmp_filename, '-ss',
            str(start_time), '-t',
            str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy',
            '-threads', '1', '-loglevel', 'panic',
            '"%s"' % output_filename
        ]
        command = ' '.join(command)
        try:
            subprocess.check_output(
                command, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            return status, 'Trimming Failed'

    # Check if the video was successfully saved.
    status = os.path.exists(output_filename)
    os.remove(tmp_filename)
    return status, 'Downloaded'


def download_clip_wrapper(item, trim_format, tmp_dir, output_dir):
    """Wrapper for parallel processing purposes."""
    output_filename = construct_video_filename(item, trim_format, output_dir)
    clip_id = os.path.basename(output_filename).split('.mp4')[0]
    if os.path.exists(output_filename):
        status = tuple([clip_id, True, 'Exists'])
        return status

    youtube_id, start_time, end_time = item
    downloaded, log = download_clip(
        youtube_id, output_filename, start_time, end_time, tmp_dir=tmp_dir)

    status = tuple([clip_id, downloaded, log])
    return status


def parse_hvu_annotations(input_csv):
    """Returns the parsed annotations as a list of tuples.

    arguments:
    ---------
    input_csv: str
        Path to CSV file containing the following columns:
        'Tags, youtube_id, time_start, time_end'

    returns:
    -------
    dataset: List of tuples. Each tuple consists of
        (youtube_id, time_start, time_end). The type of time is float.
    """
    lines = open(input_csv).readlines()
    lines = [x.strip().split(',')[1:] for x in lines[1:]]
    lines = [(x[0], float(x[1]), float(x[2])) for x in lines]

    return lines


def main(input_csv,
         output_dir,
         trim_format='%06d',
         num_jobs=24,
         tmp_dir='/tmp/hvu'):
    # Read and parse the HVU annotations.
    dataset = parse_hvu_annotations(input_csv)

    # Create the folders where videos will be saved later.
    create_video_folders(dataset, output_dir, tmp_dir)

    # Download all clips.
    if num_jobs == 1:
        status_lst = []
        for item in dataset:
            status_lst.append(
                download_clip_wrapper(item, trim_format, tmp_dir, output_dir))
    else:
        status_lst = Parallel(n_jobs=num_jobs)(
            delayed(download_clip_wrapper)(item, trim_format, tmp_dir,
                                           output_dir) for item in dataset)

    # Clean tmp dir.
    shutil.rmtree(tmp_dir)
    # Save download report.
    mmcv.dump(status_lst, 'download_report.json')


if __name__ == '__main__':
    description = 'Helper script for downloading and trimming HVU videos.'
    p = argparse.ArgumentParser(description=description)
    p.add_argument(
        'input_csv',
        type=str,
        help=('CSV file containing the following format: '
              'Tags, youtube_id, time_start, time_end'))
    p.add_argument(
        'output_dir',
        type=str,
        help='Output directory where videos will be saved.')
    p.add_argument(
        '-f',
        '--trim-format',
        type=str,
        default='%06d',
        help=('This will be the format for the '
              'filename of trimmed videos: '
              'videoid_%0xd(start_time)_%0xd(end_time).mp4. '
              'Note that start_time and end_time are multiplied by 10 '
              'because the annotations keep one decimal place.'))
    p.add_argument('-n', '--num-jobs', type=int, default=24)
    p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/hvu')
    main(**vars(p.parse_args()))
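As a quick sanity check, the row-to-filename logic above can be exercised in isolation. The sketch below mirrors `parse_hvu_annotations` and `construct_video_filename` for a single row; the CSV row and the `parse_row` helper are made up for illustration and are not part of the commit:

```python
import os


def parse_row(line, trim_format='%06d', output_dir='videos_train'):
    # Drop the leading 'Tags' column; keep (youtube_id, time_start, time_end).
    youtube_id, start, end = line.strip().split(',')[1:]
    start, end = float(start), float(end)
    # Times are multiplied by 10 so one decimal place survives in the filename.
    basename = '%s_%s_%s.mp4' % (youtube_id, trim_format % int(start * 10),
                                 trim_format % int(end * 10))
    return os.path.join(output_dir, basename)


# Hypothetical row in the 'Tags, youtube_id, time_start, time_end' format.
row = 'music|concert,dQw4w9WgXcQ,4.5,14.5'
print(parse_row(row))
```

For this row the clip would be trimmed from 4.5 s to 14.5 s and written under `videos_train/` with the start/end encoded as `000045`/`000145`.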
22 changes: 22 additions & 0 deletions tools/data/hvu/download_annotations.sh
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

set -e

DATA_DIR="../../../data/hvu/annotations"

if [[ ! -d "${DATA_DIR}" ]]; then
  echo "${DATA_DIR} does not exist. Creating";
  mkdir -p ${DATA_DIR}
fi

git clone https://github.com/holistic-video-understanding/HVU-Dataset.git

cd HVU-Dataset
unzip -o HVU_Train_V1.0.zip
unzip -o HVU_Val_V1.0.zip
cd ..
mv HVU-Dataset/HVU_Train_V1.0.csv ${DATA_DIR}/hvu_train.csv
mv HVU-Dataset/HVU_Val_V1.0.csv ${DATA_DIR}/hvu_val.csv
mv HVU-Dataset/HVU_Tags_Categories_V1.0.csv ${DATA_DIR}/hvu_categories.csv

rm -rf HVU-Dataset
14 changes: 14 additions & 0 deletions tools/data/hvu/download_videos.sh
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

# set up environment
conda env create -f environment.yml
source activate hvu
pip install --upgrade youtube-dl

DATA_DIR="../../../data/hvu"
ANNO_DIR="../../../data/hvu/annotations"
python download.py ${ANNO_DIR}/hvu_train.csv ${DATA_DIR}/videos_train
python download.py ${ANNO_DIR}/hvu_val.csv ${DATA_DIR}/videos_val

source deactivate
conda remove -n hvu --all
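Each run of `download.py` above leaves a `download_report.json` in the working directory (written by `mmcv.dump` at the end of `main`). A minimal sketch for tallying the outcomes, assuming the report is plain JSON and each entry is the `(clip_id, status, message)` tuple built in `download_clip_wrapper`; `summarize_report` is a hypothetical helper, not part of the commit:

```python
import json
from collections import Counter


def summarize_report(path='download_report.json'):
    # Each entry is [clip_id, succeeded, message],
    # e.g. ['xxxxxxxxxxx_000045_000145', true, 'Downloaded'].
    with open(path) as f:
        entries = json.load(f)
    return Counter(message for _, _, message in entries)
```

A non-zero count for `'Downloading Failed'` or `'Trimming Failed'` indicates clips worth retrying, since YouTube downloads fail transiently.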
36 changes: 36 additions & 0 deletions tools/data/hvu/environment.yml
@@ -0,0 +1,36 @@
name: hvu  # must match the env name activated in download_videos.sh
channels:
  - anaconda
  - menpo
  - conda-forge
  - defaults
dependencies:
  - ca-certificates=2020.1.1
  - certifi=2020.4.5.1
  - ffmpeg=2.8.6
  - libcxx=10.0.0
  - libedit=3.1.20181209
  - libffi=3.3
  - ncurses=6.2
  - openssl=1.1.1g
  - pip=20.0.2
  - python=3.7.7
  - readline=8.0
  - setuptools=46.4.0
  - sqlite=3.31.1
  - tk=8.6.8
  - wheel=0.34.2
  - xz=5.2.5
  - zlib=1.2.11
  - pip:
      - decorator==4.4.2
      - intel-openmp==2019.0
      - joblib==0.15.1
      - mkl==2019.0
      - numpy==1.18.4
      - olefile==0.46
      - pandas==1.0.3
      - python-dateutil==2.8.1
      - pytz==2020.1
      - six==1.14.0
      - youtube-dl==2020.5.8
10 changes: 10 additions & 0 deletions tools/data/hvu/extract_frames.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

cd ../
python build_rawframes.py ../../data/hvu/videos_train/ ../../data/hvu/rawframes_train/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256
echo "Raw frames (RGB and tv-l1) Generated for train set"

python build_rawframes.py ../../data/hvu/videos_val/ ../../data/hvu/rawframes_val/ --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256
echo "Raw frames (RGB and tv-l1) Generated for val set"

cd hvu/