forked from open-mmlab/mmsegmentation
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Feature] Add HVU datatools (open-mmlab#227)
* init commit * fix download annotations * parse hvu tags * add downloading & trimming scripts * add file list generation * finish hvu datatools * resolve comments * init commit * fix download annotations * parse hvu tags * add downloading & trimming scripts * add file list generation * finish hvu datatools * resolve comments * update changelog * Update data_preparation.md Co-authored-by: Jintao Lin <[email protected]>
- Loading branch information
1 parent
c3330d3
commit ec6be3b
Showing
13 changed files
with
552 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,200 @@ | ||
# ------------------------------------------------------------------------------ | ||
# Adapted from https://github.com/activitynet/ActivityNet/ | ||
# Original licence: Copyright (c) Microsoft, under the MIT License. | ||
# ------------------------------------------------------------------------------ | ||
|
||
import argparse | ||
import glob | ||
import os | ||
import shutil | ||
import ssl | ||
import subprocess | ||
import uuid | ||
|
||
import mmcv | ||
from joblib import Parallel, delayed | ||
|
||
# NOTE(review): this swaps the default HTTPS context factory of the ``ssl``
# module for the unverified one, i.e. certificate verification is switched off
# for every HTTPS connection that relies on the default context in this
# process. It also depends on private ``ssl`` attributes.
ssl._create_default_https_context = ssl._create_unverified_context | ||
# Module-level placeholder; nothing in the visible code reads or assigns it.
args = None | ||
|
||
|
||
def create_video_folders(dataset, output_dir, tmp_dir):
    """Create the output and temporary directories if they are missing.

    Args:
        dataset: Parsed annotations. Not used by this function; the
            parameter is kept so existing callers keep working.
        output_dir (str): Directory that will hold the trimmed clips.
        tmp_dir (str): Directory that will hold the untrimmed downloads.
    """
    # ``exist_ok=True`` replaces the check-then-create sequence, which could
    # raise FileExistsError when another process created the directory
    # between the check and the ``makedirs`` call.
    for directory in (output_dir, tmp_dir):
        os.makedirs(directory, exist_ok=True)
|
||
|
||
def construct_video_filename(item, trim_format, output_dir):
    """Build the output path of the trimmed clip described by ``item``.

    ``item`` is a ``(youtube_id, start_time, end_time)`` row. Both times are
    multiplied by 10 and truncated to integers before being rendered with
    ``trim_format``; the resulting ``<id>_<start>_<end>.mp4`` name is joined
    onto ``output_dir``.
    """
    video_id, clip_start, clip_end = item
    # File names carry the timestamps scaled by 10 so one decimal survives.
    start_tag, end_tag = (
        trim_format % int(stamp * 10) for stamp in (clip_start, clip_end))
    clip_name = '%s_%s_%s.mp4' % (video_id, start_tag, end_tag)
    return os.path.join(output_dir, clip_name)
|
||
|
||
def download_clip(video_identifier,
                  output_filename,
                  start_time,
                  end_time,
                  tmp_dir='/tmp/hvu',
                  num_attempts=5,
                  url_base='https://www.youtube.com/watch?v='):
    """Download a video from YouTube and trim it to the requested segment.

    The full video is fetched with ``youtube-dl`` into ``tmp_dir`` and then
    cut with ``ffmpeg``; both executables must be available on ``PATH``.

    arguments:
    ---------
    video_identifier: str
        Unique YouTube video identifier (11 characters).
    output_filename: str
        File path where the trimmed video will be stored.
    start_time: float
        Indicates the beginning time in seconds from where the video
        will be trimmed.
    end_time: float
        Indicates the ending time in seconds of the trimmed video.
    tmp_dir: str
        Existing directory used for the untrimmed download.
    num_attempts: int
        Maximum number of download attempts before giving up.
    url_base: str
        URL prefix that ``video_identifier`` is appended to.

    returns:
    -------
    (status, message): ``status`` is True when ``output_filename`` exists
        when the function returns. ``message`` is one of 'Exists',
        'Downloaded', 'Downloading Failed' or 'Trimming Failed'.
    """
    # Defensive argument checking.
    assert isinstance(video_identifier, str), 'video_identifier must be string'
    assert isinstance(output_filename, str), 'output_filename must be string'
    assert len(video_identifier) == 11, 'video_identifier must have length 11'

    # Nothing to do (and no temporary file to clean up) when the clip is
    # already on disk.
    if os.path.exists(output_filename):
        return True, 'Exists'

    # A random prefix keeps parallel jobs from clobbering each other.
    # youtube-dl replaces ``%(ext)s`` with the real container extension.
    tmp_prefix = os.path.join(tmp_dir, str(uuid.uuid4()))
    command = [
        'youtube-dl', '--quiet', '--no-warnings', '--no-check-certificate',
        '-f', 'mp4', '-o', tmp_prefix + '.%(ext)s',
        url_base + video_identifier
    ]
    print(' '.join(command))
    # The command is passed as an argument list (no shell), so paths and
    # identifiers need no quoting and cannot be interpreted by a shell.
    for _ in range(num_attempts):
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except (subprocess.CalledProcessError, OSError):
            # OSError covers a missing ``youtube-dl`` executable.
            continue
        break
    else:
        return False, 'Downloading Failed'

    # Match on the exact random prefix; ``glob.escape`` protects against
    # wildcard characters in ``tmp_dir``.
    downloaded = glob.glob(glob.escape(tmp_prefix) + '.*')
    if not downloaded:
        return False, 'Downloading Failed'
    tmp_filename = downloaded[0]

    # Construct command to trim the videos (ffmpeg required).
    command = [
        'ffmpeg', '-i', tmp_filename, '-ss',
        str(start_time), '-t',
        str(end_time - start_time), '-c:v', 'libx264', '-c:a', 'copy',
        '-threads', '1', '-loglevel', 'panic', output_filename
    ]
    try:
        subprocess.check_output(command, stderr=subprocess.STDOUT)
    except (subprocess.CalledProcessError, OSError):
        # Do not leave a truncated clip behind: a later run would treat it
        # as an already finished download.
        if os.path.exists(output_filename):
            os.remove(output_filename)
        return False, 'Trimming Failed'
    finally:
        # Always release the untrimmed download, also when trimming failed.
        for leftover in downloaded:
            if os.path.exists(leftover):
                os.remove(leftover)

    # Check if the video was successfully saved.
    status = os.path.exists(output_filename)
    return status, 'Downloaded' if status else 'Trimming Failed'
|
||
|
||
def download_clip_wrapper(item, trim_format, tmp_dir, output_dir):
    """Download a single clip and report the outcome.

    Thin adapter around ``download_clip`` so one annotation row can be
    processed per parallel task. Returns a ``(clip_id, success, message)``
    tuple, where ``clip_id`` is the output file name without ``.mp4``.
    """
    target_path = construct_video_filename(item, trim_format, output_dir)
    clip_id = os.path.basename(target_path).split('.mp4')[0]

    if not os.path.exists(target_path):
        video_id, clip_start, clip_end = item
        succeeded, message = download_clip(
            video_id, target_path, clip_start, clip_end, tmp_dir=tmp_dir)
        return (clip_id, succeeded, message)

    # The clip is already on disk, so the download is skipped.
    return (clip_id, True, 'Exists')
|
||
|
||
def parse_hvu_annotations(input_csv):
    """Parse an HVU annotation CSV into a list of clip descriptors.

    arguments:
    ---------
    input_csv: str
        Path to CSV file containing the following columns:
          'Tags, youtube_id, time_start, time_end'
        The first line is treated as a header and skipped.

    returns:
    -------
    dataset: List of tuples. Each tuple consists of
        (youtube_id, time_start, time_end). The type of time is float.

    raises:
    ------
    IndexError: a non-blank row has fewer than four columns.
    ValueError: a time column cannot be converted to float.
    """
    # Local import: the csv module is only needed by this function.
    import csv

    dataset = []
    # ``with`` closes the file deterministically; ``newline=''`` is what the
    # csv module expects so quoted fields are parsed correctly.
    with open(input_csv, newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # Skip the header line (if any).
        for row in reader:
            # Tolerate blank lines, e.g. a trailing newline at end of file.
            if not any(field.strip() for field in row):
                continue
            # Column 0 holds the tags, which are not needed for downloading.
            dataset.append((row[1], float(row[2]), float(row[3])))

    return dataset
|
||
|
||
def main(input_csv,
         output_dir,
         trim_format='%06d',
         num_jobs=24,
         tmp_dir='/tmp/hvu'):
    """Download and trim every clip listed in ``input_csv``.

    Clips are written to ``output_dir``; ``tmp_dir`` holds the untrimmed
    downloads and is deleted at the end. With ``num_jobs == 1`` the clips are
    processed sequentially, otherwise through joblib. The per-clip results
    are dumped to ``download_report.json`` in the current working directory.
    """
    # Read the annotations and make sure the target folders exist.
    clips = parse_hvu_annotations(input_csv)
    create_video_folders(clips, output_dir, tmp_dir)

    if num_jobs != 1:
        # Fan the clips out over joblib workers.
        tasks = (
            delayed(download_clip_wrapper)(clip, trim_format, tmp_dir,
                                           output_dir) for clip in clips)
        report = Parallel(n_jobs=num_jobs)(tasks)
    else:
        report = [
            download_clip_wrapper(clip, trim_format, tmp_dir, output_dir)
            for clip in clips
        ]

    # Remove the temporary directory, then persist the download report.
    shutil.rmtree(tmp_dir)
    mmcv.dump(report, 'download_report.json')
|
||
|
||
if __name__ == '__main__':
    # Command line entry point: parse the options and forward them to main().
    description = 'Helper script for downloading and trimming HVU videos.'
    p = argparse.ArgumentParser(description=description)
    p.add_argument(
        'input_csv',
        type=str,
        help=('CSV file containing the following format: '
              'Tags, youtube_id, time_start, time_end'))
    p.add_argument(
        'output_dir',
        type=str,
        help='Output directory where videos will be saved.')
    # argparse applies %-formatting to help strings, so a literal percent
    # sign must be written as ``%%``; the unescaped ``%0xd`` made ``--help``
    # fail with a TypeError.
    p.add_argument(
        '-f',
        '--trim-format',
        type=str,
        default='%06d',
        help=('This will be the format for the '
              'filename of trimmed videos: '
              'videoid_%%0xd(start_time)_%%0xd(end_time).mp4. '
              'Note that the start_time is multiplied by 10 since '
              'decimal exists somewhere. '))
    p.add_argument('-n', '--num-jobs', type=int, default=24)
    p.add_argument('-t', '--tmp-dir', type=str, default='/tmp/hvu')
    # The argparse destinations match the keyword parameters of main().
    main(**vars(p.parse_args()))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/usr/bin/env bash

# Fetch the HVU annotation files from the HVU-Dataset repository and store
# them as hvu_train.csv, hvu_val.csv and hvu_categories.csv in ${DATA_DIR}.
# All paths are relative to the current working directory.

set -e

DATA_DIR="../../../data/hvu/annotations"

if [[ ! -d "${DATA_DIR}" ]]; then
  echo "${DATA_DIR} does not exist. Creating";
  mkdir -p "${DATA_DIR}"
fi

# Remove a checkout left behind by an interrupted run; otherwise `git clone`
# fails because the directory exists and `set -e` aborts the script.
rm -rf HVU-Dataset
git clone https://github.com/holistic-video-understanding/HVU-Dataset.git

cd HVU-Dataset
unzip -o HVU_Train_V1.0.zip
unzip -o HVU_Val_V1.0.zip
cd ..
mv HVU-Dataset/HVU_Train_V1.0.csv "${DATA_DIR}/hvu_train.csv"
mv HVU-Dataset/HVU_Val_V1.0.csv "${DATA_DIR}/hvu_val.csv"
mv HVU-Dataset/HVU_Tags_Categories_V1.0.csv "${DATA_DIR}/hvu_categories.csv"

# The clone is only needed for the files moved above.
rm -rf HVU-Dataset
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/usr/bin/env bash

# set up environment
# `-n hvu` overrides whatever name environment.yml declares, so the
# environment created here is the one activated and removed below.
conda env create -f environment.yml -n hvu
source activate hvu
pip install --upgrade youtube-dl
# The download script imports mmcv, which environment.yml does not list.
pip install mmcv

DATA_DIR="../../../data/hvu"
ANNO_DIR="../../../data/hvu/annotations"
python download.py "${ANNO_DIR}/hvu_train.csv" "${DATA_DIR}/videos_train"
python download.py "${ANNO_DIR}/hvu_val.csv" "${DATA_DIR}/videos_val"

# tear down the temporary environment
source deactivate hvu
conda remove -n hvu --all
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# Conda environment for the HVU download tools. The name has to match the
# environment that the download script activates and removes (`hvu`).
name: hvu
channels:
  - anaconda
  - menpo
  - conda-forge
  - defaults
dependencies:
  - ca-certificates=2020.1.1
  - certifi=2020.4.5.1
  - ffmpeg=2.8.6
  - libcxx=10.0.0
  - libedit=3.1.20181209
  - libffi=3.3
  - ncurses=6.2
  - openssl=1.1.1g
  - pip=20.0.2
  - python=3.7.7
  - readline=8.0
  - setuptools=46.4.0
  - sqlite=3.31.1
  - tk=8.6.8
  - wheel=0.34.2
  - xz=5.2.5
  - zlib=1.2.11
  # Packages below are installed with pip inside the environment.
  - pip:
    - decorator==4.4.2
    - intel-openmp==2019.0
    - joblib==0.15.1
    - mkl==2019.0
    - numpy==1.18.4
    - olefile==0.46
    - pandas==1.0.3
    - python-dateutil==2.8.1
    - pytz==2020.1
    - six==1.14.0
    - youtube-dl==2020.5.8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#!/usr/bin/env bash

# Extract raw frames (RGB and TV-L1 flow) for the HVU train and val videos.
# build_rawframes.py lives one directory up, so step there first.
cd ../
for split in train val; do
  python build_rawframes.py "../../data/hvu/videos_${split}/" "../../data/hvu/rawframes_${split}/" --level 1 --flow-type tvl1 --ext mp4 --task both --new-short 256
  echo "Raw frames (RGB and tv-l1) Generated for ${split} set"
done

# Return to the directory the script was started from.
cd hvu/
Oops, something went wrong.