diff --git a/openadapt/scripts/scrub.py b/openadapt/scripts/scrub.py new file mode 100644 index 000000000..0720f885a --- /dev/null +++ b/openadapt/scripts/scrub.py @@ -0,0 +1,133 @@ +"""Module for scrubbing a media file. + +Usage: + $ python -m openadapt.scripts.scrub scrub_mp4 \ + MP4_FILE SCRUB_ALL_ENTITIES PLAYBACK_SPEED_MULTIPLIER \ + CROP_START_TIME CROP_END_TIME + +Parameters: + mp4_file: Path to the mp4 file (str) + scrub_all_entities: True/False + playback_speed_multiplier: (float/int) + crop_start_time: (int) [in seconds] + crop_end_time: (int) [in seconds] + + Only mp4_file is required; the other arguments have defaults. + +Example: To redact all entities in sample2.mp4 + from the 2nd second to the 16th second and play it at 2x speed: + $ python -m openadapt.scripts.scrub scrub_mp4 sample2.mp4 True 2 2 16 +""" + +from typing import Optional +import math + +from tqdm import tqdm +from PIL import Image +from moviepy.editor import VideoFileClip, VideoClip +from moviepy.video.fx import speedx +import fire +import numpy as np + +from openadapt import config, scrub, utils + + +def _make_frame(time, final, progress_bar, progress_threshold): + """ + Private function to scrub a frame. + + Args: + time: Time (in seconds) + final: Final video clip + progress_bar: Progress bar + frame_count: Total number of frames + progress_interval: Progress interval + progress_threshold: Progress threshold + + Returns: + A Redacted frame + """ + + frame = final.get_frame(time) + + image = Image.fromarray(frame) + + redacted_image = scrub.scrub_image(image) # Redaction + + # Convert redacted image back to OpenCV format + redacted_frame = np.array(redacted_image) + + progress_bar.update(1) # Update the progress bar + + if progress_bar.n >= progress_threshold: + progress_threshold += progress_threshold + + return redacted_frame + + +def scrub_mp4( + mp4_file: str, + scrub_all_entities: bool = False, + playback_speed_multiplier: float = 1.0, + crop_start_time: int = 0, + crop_end_time: Optional[int] = None, +) -> str: + """ + Scrub an mp4 file. 
+ + Args: + mp4_file: Path to the mp4 file. + scrub_all_entities: True/False. If true, scrubs all entities + playback_speed_multiplier: Multiplier for playback speed. (float/int) + crop_start_time: Start Time (in seconds) + crop_end_time: End Time (in seconds) + + Returns: + A message string ending with the path to the scrubbed (redacted) mp4 file. + """ + + if scrub_all_entities: + config.SCRUB_IGNORE_ENTITIES = [] + + mp4_clip = VideoFileClip(mp4_file) + cropped_clip = mp4_clip.subclip(crop_start_time, crop_end_time) + final = cropped_clip.fx(VideoClip.speedx, playback_speed_multiplier) + + # Prepare progress bar + frame_count = round(final.duration * final.fps) + progress_bar_format = ( + "{desc}: {percentage:.0f}% " + "| {bar} | " + "{n_fmt}/{total_fmt} | {rate_fmt} | [{elapsed}<{remaining}] |" + ) + progress_bar = tqdm( + total=frame_count, + desc="Processing", + unit="frame", + bar_format=progress_bar_format, + colour="green", + ) + progress_interval = 0.1 # Print progress every 10% of frames + progress_threshold = math.floor(frame_count * progress_interval) + + redacted_clip = VideoClip( + make_frame=lambda t: _make_frame( + t, + final, + progress_bar, + progress_threshold, + ), + duration=final.duration, + ) # Redact the clip + + scrubbed_file = mp4_file[:-4] + "_scrubbed.mp4" + redacted_clip.write_videofile( + scrubbed_file, fps=final.fps, logger=None + ) # Write the redacted clip to a file + + progress_bar.close() + return "Scrubbed File Saved at: " + scrubbed_file + + +if __name__ == "__main__": + fire.Fire(utils.get_functions(__name__)) diff --git a/openadapt/scrub.py b/openadapt/scrub.py index 1cfd62a12..dedb691de 100644 --- a/openadapt/scrub.py +++ b/openadapt/scrub.py @@ -45,6 +45,7 @@ def scrub_text(text: str, is_separated: bool = False) -> str: Returns: str: Scrubbed text """ + if text is None: return None @@ -100,6 +101,7 @@ def scrub_image( Returns: PIL.Image: The scrubbed image with PII and PHI removed. 
""" + redacted_image = IMAGE_REDACTOR.redact( image, fill=fill_color, entities=SCRUBBING_ENTITIES ) @@ -159,6 +161,7 @@ def _scrub_text_item( Returns: str: The scrubbed value """ + if key in ("text", "canonical_text"): return scrub_text(value, is_separated=True) if force_scrub_children: @@ -203,6 +206,7 @@ def _scrub_list_item( Returns: dict/str: The scrubbed dict/value respectively """ + if isinstance(item, dict): return scrub_dict( item, list_keys, force_scrub_children=force_scrub_children @@ -225,6 +229,7 @@ def scrub_dict( Returns: dict: The scrubbed dict with PII and PHI removed. """ + if list_keys is None: list_keys = config.SCRUB_KEYS_HTML @@ -270,6 +275,7 @@ def scrub_list_dicts(input_list: List[Dict], list_keys: List = None) -> List[Dic Returns: list[dict]: The scrubbed list of dicts with PII and PHI removed. """ + scrubbed_list_dicts = [] for input_dict in input_list: scrubbed_list_dicts.append(scrub_dict(input_dict, list_keys)) diff --git a/poetry.lock b/poetry.lock index faff79ce4..3fff85e74 100644 --- a/poetry.lock +++ b/poetry.lock @@ -971,13 +971,13 @@ files = [ [[package]] name = "decorator" -version = "5.1.1" +version = "4.4.2" description = "Decorators for Humans" optional = false -python-versions = ">=3.5" +python-versions = ">=2.6, !=3.0.*, !=3.1.*" files = [ - {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, - {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, + {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"}, + {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"}, ] [[package]] @@ -1609,6 +1609,52 @@ files = [ {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, ] +[[package]] +name = "imageio" +version 
= "2.31.1" +description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats." +optional = false +python-versions = ">=3.7" +files = [ + {file = "imageio-2.31.1-py3-none-any.whl", hash = "sha256:4106fb395ef7f8dc0262d6aa1bb03daba818445c381ca8b7d5dfc7a2089b04df"}, + {file = "imageio-2.31.1.tar.gz", hash = "sha256:f8436a02af02fd63f272dab50f7d623547a38f0e04a4a73e2b02ae1b8b180f27"}, +] + +[package.dependencies] +numpy = "*" +pillow = ">=8.3.2" + +[package.extras] +all-plugins = ["astropy", "av", "imageio-ffmpeg", "psutil", "tifffile"] +all-plugins-pypy = ["av", "imageio-ffmpeg", "psutil", "tifffile"] +build = ["wheel"] +dev = ["black", "flake8", "fsspec[github]", "pytest", "pytest-cov"] +docs = ["numpydoc", "pydata-sphinx-theme", "sphinx (<6)"] +ffmpeg = ["imageio-ffmpeg", "psutil"] +fits = ["astropy"] +full = ["astropy", "av", "black", "flake8", "fsspec[github]", "gdal", "imageio-ffmpeg", "itk", "numpydoc", "psutil", "pydata-sphinx-theme", "pytest", "pytest-cov", "sphinx (<6)", "tifffile", "wheel"] +gdal = ["gdal"] +itk = ["itk"] +linting = ["black", "flake8"] +pyav = ["av"] +test = ["fsspec[github]", "pytest", "pytest-cov"] +tifffile = ["tifffile"] + +[[package]] +name = "imageio-ffmpeg" +version = "0.4.8" +description = "FFMPEG wrapper for Python" +optional = false +python-versions = ">=3.5" +files = [ + {file = "imageio-ffmpeg-0.4.8.tar.gz", hash = "sha256:fdaa05ad10fe070b7fa8e5f615cb0d28f3b9b791d00af6d2a11e694158d10aa9"}, + {file = "imageio_ffmpeg-0.4.8-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:dba439a303d65061aef17d2ee9324ecfa9c6b4752bd0953b309fdbb79b38451e"}, + {file = "imageio_ffmpeg-0.4.8-py3-none-manylinux2010_x86_64.whl", hash = "sha256:7caa9ce9fc0d7e2f3160ce8cb70a115e5211e0f048e5c1509163d8f89d1080df"}, + {file = "imageio_ffmpeg-0.4.8-py3-none-manylinux2014_aarch64.whl", hash = 
"sha256:dd3ef9835df91570a1cbd9e36dfbc7d228fca42dbb11636e20df75d719de2949"}, + {file = "imageio_ffmpeg-0.4.8-py3-none-win32.whl", hash = "sha256:0e2688120b3bdb367897450d07c1b1300e96a0bace03ba7de2eb8d738237ea9a"}, + {file = "imageio_ffmpeg-0.4.8-py3-none-win_amd64.whl", hash = "sha256:120d70e6448617cad6213e47dee3a3310117c230f532dd614ed3059a78acf13a"}, +] + [[package]] name = "imagesize" version = "1.4.1" @@ -2367,6 +2413,33 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "moviepy" +version = "1.0.3" +description = "Video editing with Python" +optional = false +python-versions = "*" +files = [ + {file = "moviepy-1.0.3.tar.gz", hash = "sha256:2884e35d1788077db3ff89e763c5ba7bfddbd7ae9108c9bc809e7ba58fa433f5"}, +] + +[package.dependencies] +decorator = ">=4.0.2,<5.0" +imageio = {version = ">=2.5,<3.0", markers = "python_version >= \"3.4\""} +imageio_ffmpeg = {version = ">=0.2.0", markers = "python_version >= \"3.4\""} +numpy = [ + {version = ">=1.17.3", markers = "python_version != \"2.7\""}, + {version = "*", markers = "python_version >= \"2.7\""}, +] +proglog = "<=1.0.0" +requests = ">=2.8.1,<3.0" +tqdm = ">=4.11.2,<5.0" + +[package.extras] +doc = ["Sphinx (>=1.5.2,<2.0)", "numpydoc (>=0.6.0,<1.0)", "pygame (>=1.9.3,<2.0)", "sphinx_rtd_theme (>=0.1.10b0,<1.0)"] +optional = ["matplotlib (>=2.0.0,<3.0)", "opencv-python (>=3.0,<4.0)", "scikit-image (>=0.13.0,<1.0)", "scikit-learn", "scipy (>=0.19.0,<1.5)", "youtube_dl"] +test = ["coverage (<5.0)", "coveralls (>=1.1,<2.0)", "pytest (>=3.0.0,<4.0)", "pytest-cov (>=2.5.1,<3.0)", "requests (>=2.8.1,<3.0)"] + [[package]] name = "mpmath" version = "1.3.0" @@ -3174,6 +3247,20 @@ pydicom = ">=2.3.0" pypng = ">=0.20220715.0" pytesseract = "0.3.7" +[[package]] +name = "proglog" +version = "0.1.10" +description = "Log and progress bar manager for console, notebooks, web..." 
+optional = false +python-versions = "*" +files = [ + {file = "proglog-0.1.10-py3-none-any.whl", hash = "sha256:19d5da037e8c813da480b741e3fa71fb1ac0a5b02bf21c41577c7f327485ec50"}, + {file = "proglog-0.1.10.tar.gz", hash = "sha256:658c28c9c82e4caeb2f25f488fff9ceace22f8d69b15d0c1c86d64275e4ddab4"}, +] + +[package.dependencies] +tqdm = "*" + [[package]] name = "prompt-toolkit" version = "3.0.38" diff --git a/pyproject.toml b/pyproject.toml index 9d646362f..c3012d1ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,6 +55,7 @@ torchvision = "^0.15.2" sumy = "0.11.0" nltk = "3.8.1" pywinauto = {version = "^0.6.8", markers = "sys_platform == 'win32'"} +moviepy = "1.0.3" python-levenshtein = "^0.21.1" magic-wormhole = "0.12.0" diff --git a/requirements.txt b/requirements.txt index 41df1f83b..5415d1edb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,6 +11,7 @@ fuzzywuzzy==0.18.0 ipdb==0.13.11 loguru==0.6.0 matplotlib==3.6.2 +moviepy==1.0.3 mss==6.1.0 nltk==3.8.1 openai==0.27.5 @@ -38,4 +39,4 @@ magic-wormhole==0.12.0 nicegui==1.2.16 transformers==4.29.2 python-dotenv==1.0.0 -python-Levenshtein==0.21.1 +python-Levenshtein==0.21.1 \ No newline at end of file