Skip to content

Commit

Permalink
Implement chunking of audio and video to allow for larger and longer …
Browse files Browse the repository at this point in the history
…files

Videos are chunked into 1GB parts and Audio is cut into 8min sections.

This allows ViGenAiR to bypass current Google Cloud API limitations of
50GB size and 3h content. Virtually any size / length of video can now
be processed.

Change-Id: Icd84478019ad5ca4ff9fbe12e5caf8b2b8a137ff
  • Loading branch information
mohabfekry committed Sep 19, 2024
1 parent 779bf89 commit ed742e4
Show file tree
Hide file tree
Showing 6 changed files with 898 additions and 162 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ limitations under the License.

Update to the latest version by running `npm run update-app` after pulling the latest changes from the repository via `git pull --rebase --autostash`; you would need to redeploy the *UI* for features marked as `frontend`, and *GCP components* for features marked as `backend`.

* [September 2024]
* `backend`: You can now process any video of any length or size - even beyond the Google Cloud Video AI API [limits](https://cloud.google.com/video-intelligence/quotas) of 50 GB size and up to 3h video length.
* [August 2024]
* Updated the [pricing](#pricing-and-quotas) section and Cloud calculator example to use the new (cheaper) pricing for `Gemini 1.5 Flash`.
* `frontend`: You can now manually move the Smart Framing crop area to better capture the point of interest. Read more [here](#3-object-tracking-and-smart-framing).
Expand Down
2 changes: 2 additions & 0 deletions service/.env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ CONFIG_VISION_MODEL: gemini-1.5-flash
CONFIG_WHISPER_MODEL: small
CONFIG_ANNOTATIONS_CONFIDENCE_THRESHOLD: '0.7'
CONFIG_MULTIMODAL_ASSET_GENERATION: 'true'
CONFIG_MAX_VIDEO_CHUNK_SIZE: '1000000000' # 1 GB
CONFIG_MAX_AUDIO_CHUNK_SIZE: '480' # 8 minutes
139 changes: 122 additions & 17 deletions service/audio/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
This module contains functions to extract, split and transcribe audio files.
"""

import datetime
import logging
import os
import pathlib
import shutil
from typing import Optional, Tuple
from typing import Optional, Sequence, Tuple

import config as ConfigService
import pandas as pd
Expand All @@ -31,6 +32,103 @@
from iso639 import languages


def combine_audio_files(output_path: str, audio_files: Sequence[str]):
  """Concatenates multiple audio files into a single output file.

  Uses ffmpeg's `concat` filter to merge the given audio files, in the
  order provided, into one audio stream written to `output_path`.

  Args:
    output_path: Path of the combined audio file to create.
    audio_files: Paths of the audio files to concatenate, in order.
  """
  ffmpeg_cmds = ['ffmpeg']
  for audio_file in audio_files:
    ffmpeg_cmds.extend(['-i', audio_file])

  # Build a concat filtergraph: feed every input's first stream into the
  # concat filter (audio only, no video), labelling the result [outa].
  ffmpeg_cmds += ['-filter_complex'] + [
      ''.join([f'[{index}:0]' for index, _ in enumerate(audio_files)])
      + f'concat=n={len(audio_files)}:v=0:a=1[outa]'
  ] + ['-map', '[outa]', output_path]

  Utils.execute_subprocess_commands(
      cmds=ffmpeg_cmds,
      description=(
          f'Merge {len(audio_files)} audio files and output to {output_path}.'
      ),
  )
  # BUG FIX: the mode must be an octal literal. The previous decimal `777`
  # (== 0o1411) set the sticky bit and removed group/other permissions.
  os.chmod(output_path, 0o777)


def combine_analysis_chunks(
    analysis_chunks: Sequence[pd.DataFrame]
) -> pd.DataFrame:
  """Combines audio analysis chunks into a single response.

  Each chunk's segment ids and timestamps are shifted by the running
  maximums of all preceding chunks, so the combined result reads as one
  contiguous timeline.

  Args:
    analysis_chunks: Per-chunk analysis dataframes, each with the columns
      `audio_segment_id`, `start_s` and `end_s`.

  Returns:
    A single dataframe with all chunks concatenated and re-indexed.
    Empty if `analysis_chunks` is empty.
  """
  shifted_chunks = []
  max_audio_segment_id = 0
  max_end_s = 0

  for chunk in analysis_chunks:
    # Work on a copy so the caller's dataframes are not mutated in place.
    chunk = chunk.copy()
    chunk['audio_segment_id'] += max_audio_segment_id
    chunk['start_s'] += max_end_s
    chunk['end_s'] += max_end_s

    max_audio_segment_id = chunk['audio_segment_id'].max()
    max_end_s = chunk['end_s'].max()

    shifted_chunks.append(chunk)

  if not shifted_chunks:
    return pd.DataFrame()
  # Concatenate once at the end instead of repeatedly inside the loop
  # (repeated pd.concat is quadratic in the number of chunks).
  return pd.concat(shifted_chunks, ignore_index=True)


def combine_subtitle_files(
    audio_output_dir: str,
    subtitles_output_path: str,
):
  """Combines audio analysis subtitle files content into a single file.

  Reads every per-chunk subtitle file in `audio_output_dir`, rebases each
  cue's timestamps onto the end time of the previous chunk, and writes the
  concatenated result to `subtitles_output_path`.

  Args:
    audio_output_dir: Directory containing the per-chunk subtitle files.
    subtitles_output_path: Path of the combined subtitle file to write.
  """
  # NOTE(review): pathlib glob order is OS-dependent, not sorted — presumably
  # the chunk file names happen to enumerate in order; verify against caller.
  subtitles_files = [
      str(file_path) for file_path in pathlib.Path(audio_output_dir).
      glob(f'*.{ConfigService.OUTPUT_SUBTITLES_TYPE}')
  ]
  logging.info(
      'THREADING - Combining %d subtitle files found in %s...',
      len(subtitles_files),
      audio_output_dir,
  )
  combined_content = ''
  # Running offset to rebase each chunk's cues; starts at time zero.
  last_timestamp = datetime.datetime.strptime('00:00:00.000', '%H:%M:%S.%f')

  for index, subtitles_file in enumerate(subtitles_files):
    with open(subtitles_file, 'r', encoding='utf-8') as f:
      lines = f.readlines()

    # Skip the 2-line header of every file after the first, so the combined
    # output contains a single header.
    if index:
      lines = lines[2:]

    for line in lines:
      if '-->' in line:
        # Cue timing line: shift both timestamps by the accumulated offset.
        # Timestamps are sliced as 'MM:SS.mmm' — assumes each chunk is under
        # one hour (audio chunks are 8 minutes) — TODO confirm.
        start, end = line.strip().split(' --> ')
        start_time = last_timestamp + datetime.timedelta(
            minutes=int(start[:2]),
            seconds=int(start[3:5]),
            milliseconds=int(start[6:]),
        )
        end_time = last_timestamp + datetime.timedelta(
            minutes=int(end[:2]),
            seconds=int(end[3:5]),
            milliseconds=int(end[6:]),
        )

        # Re-emit as 'HH:MM:SS.mmm' (strip microseconds down to millis).
        start = start_time.strftime('%H:%M:%S.%f')[:-3]
        end = end_time.strftime('%H:%M:%S.%f')[:-3]

        combined_content += f'{start} --> {end}\n'
      else:
        # Non-timing lines (cue text, blanks, header) pass through unchanged.
        combined_content += line

    # Advance the offset by this chunk's final cue end time.
    # NOTE(review): lines[-3] assumes the file ends with cue text followed by
    # two trailing lines after the last timing line — confirm writer output.
    _, end = lines[-3].strip().split(' --> ')
    last_timestamp += datetime.timedelta(
        minutes=int(end[:2]),
        seconds=int(end[3:5]),
        milliseconds=int(end[6:]),
    )

  with open(subtitles_output_path, 'w', encoding='utf-8') as f:
    f.write(combined_content)


def extract_audio(video_file_path: str) -> Optional[str]:
"""Extracts the audio track from a video file, if it exists.
Expand Down Expand Up @@ -82,6 +180,7 @@ def extract_audio(video_file_path: str) -> Optional[str]:
def split_audio(
output_dir: str,
audio_file_path: str,
prefix='',
) -> Tuple[str, str]:
"""Splits the audio into vocals and music tracks and returns their paths.
Expand All @@ -108,21 +207,34 @@ def split_audio(
os.rmdir(base_path)

vocals_file_path = str(
pathlib.Path(output_dir, ConfigService.OUTPUT_SPEECH_FILE)
pathlib.Path(output_dir, f'{prefix}{ConfigService.OUTPUT_SPEECH_FILE}')
)
music_file_path = str(
pathlib.Path(output_dir, ConfigService.OUTPUT_MUSIC_FILE)
pathlib.Path(output_dir, f'{prefix}{ConfigService.OUTPUT_MUSIC_FILE}')
)

if prefix:
os.rename(
f'{output_dir}/{ConfigService.OUTPUT_SPEECH_FILE}',
vocals_file_path,
)
os.rename(
f'{output_dir}/{ConfigService.OUTPUT_MUSIC_FILE}',
music_file_path,
)

return vocals_file_path, music_file_path


def transcribe_audio(output_dir: str, audio_file_path: str) -> pd.DataFrame:
def transcribe_audio(
output_dir: str,
audio_file_path: str,
) -> Tuple[pd.DataFrame, str, float]:
"""Transcribes an audio file and returns the transcription.
Args:
output_dir: directory where the transcription will be saved.
audio_file_path: path to the audio file that will be transcribed.
output_dir: Directory where the transcription will be saved.
audio_file_path: Path to the audio file that will be transcribed.
Returns:
A pandas dataframe with the transcription data.
Expand All @@ -139,14 +251,7 @@ def transcribe_audio(output_dir: str, audio_file_path: str) -> pd.DataFrame:
)

video_language = languages.get(alpha2=info.language).name
with open(
f'{output_dir}/{ConfigService.OUTPUT_LANGUAGE_FILE}', 'w', encoding='utf8'
) as f:
f.write(video_language)
logging.info(
'LANGUAGE - %s written successfully!',
ConfigService.OUTPUT_LANGUAGE_FILE,
)
language_probability = info.language_probability

results = list(segments)
results_dict = []
Expand All @@ -162,8 +267,8 @@ def transcribe_audio(output_dir: str, audio_file_path: str) -> pd.DataFrame:
)
writer({'segments': results_dict}, audio_file_path, {'highlight_words': True})
logging.info(
'TRANSCRIPTION - %s written successfully!',
ConfigService.OUTPUT_SUBTITLES_FILE,
'TRANSCRIPTION - transcript for %s written successfully!',
audio_file_path,
)

transcription_data = []
Expand All @@ -185,4 +290,4 @@ def transcribe_audio(output_dir: str, audio_file_path: str) -> pd.DataFrame:
'transcript',
],
)
return transcription_dataframe
return transcription_dataframe, video_language, language_probability
13 changes: 13 additions & 0 deletions service/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@
CONFIG_MULTIMODAL_ASSET_GENERATION = os.environ.get(
'CONFIG_MULTIMODAL_ASSET_GENERATION', 'false'
) == 'true'
# Maximum size, in bytes, of each video chunk (defaults to 1 GB).
CONFIG_MAX_VIDEO_CHUNK_SIZE = float(
    os.environ.get('CONFIG_MAX_VIDEO_CHUNK_SIZE', '1e9')
)
# Maximum duration, in seconds, of each audio chunk (defaults to 8 minutes).
CONFIG_MAX_AUDIO_CHUNK_SIZE = float(
    os.environ.get('CONFIG_MAX_AUDIO_CHUNK_SIZE', '480')
)

CONFIG_DEFAULT_SAFETY_CONFIG = {
generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: (
Expand Down Expand Up @@ -64,6 +76,7 @@
OUTPUT_DATA_FILE = 'data.json'
OUTPUT_COMBINATIONS_FILE = 'combos.json'
OUTPUT_AV_SEGMENTS_DIR = 'av_segments_cuts'
OUTPUT_ANALYSIS_CHUNKS_DIR = 'analysis_chunks'
OUTPUT_COMBINATION_ASSETS_DIR = 'assets'

GCS_BASE_URL = 'https://storage.mtls.cloud.google.com'
Expand Down
Loading

0 comments on commit ed742e4

Please sign in to comment.