Skip to content

Commit

Permalink
Implement chunking of audio and video to allow for larger and longer …
Browse files Browse the repository at this point in the history
…files

Videos are chunked into 1GB parts and Audio is cut into 8min sections.

This allows ViGenAiR to bypass current Google Cloud API limitations of
50GB size and 3h content. Virtually any size / length of video can now
be processed.

Change-Id: Icd84478019ad5ca4ff9fbe12e5caf8b2b8a137ff
  • Loading branch information
mohabfekry committed Sep 19, 2024
1 parent 779bf89 commit ed742e4
Show file tree
Hide file tree
Showing 6 changed files with 898 additions and 162 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ limitations under the License.

Update to the latest version by running `npm run update-app` after pulling the latest changes from the repository via `git pull --rebase --autostash`; you would need to redeploy the *UI* for features marked as `frontend`, and *GCP components* for features marked as `backend`.

* [September 2024]
* `backend`: You can now process any video of any length or size - even beyond the Google Cloud Video AI API [limits](https://cloud.google.com/video-intelligence/quotas) of 50 GB size and up to 3h video length.
* [August 2024]
* Updated the [pricing](#pricing-and-quotas) section and Cloud calculator example to use the new (cheaper) pricing for `Gemini 1.5 Flash`.
* `frontend`: You can now manually move the Smart Framing crop area to better capture the point of interest. Read more [here](#3-object-tracking-and-smart-framing).
Expand Down
2 changes: 2 additions & 0 deletions service/.env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,5 @@ CONFIG_VISION_MODEL: gemini-1.5-flash
CONFIG_WHISPER_MODEL: small
CONFIG_ANNOTATIONS_CONFIDENCE_THRESHOLD: '0.7'
CONFIG_MULTIMODAL_ASSET_GENERATION: 'true'
CONFIG_MAX_VIDEO_CHUNK_SIZE: '1000000000' # 1 GB
CONFIG_MAX_AUDIO_CHUNK_SIZE: '480' # 8 minutes
139 changes: 122 additions & 17 deletions service/audio/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
This module contains functions to extract, split and transcribe audio files.
"""

import datetime
import logging
import os
import pathlib
import shutil
from typing import Optional, Tuple
from typing import Optional, Sequence, Tuple

import config as ConfigService
import pandas as pd
Expand All @@ -31,6 +32,103 @@
from iso639 import languages


def combine_audio_files(output_path: str, audio_files: Sequence[str]):
  """Concatenates multiple audio files into a single output file.

  Uses ffmpeg's `concat` filter to merge the given audio files, in the
  order provided, into one audio stream written to `output_path`.

  Args:
    output_path: Path of the combined audio file to create.
    audio_files: Paths of the audio files to concatenate, in order.
  """
  ffmpeg_cmds = ['ffmpeg']
  for audio_file in audio_files:
    ffmpeg_cmds.extend(['-i', audio_file])

  # Build a concat filtergraph: feed every input's first stream into the
  # concat filter (audio only, no video), labelling the result [outa].
  ffmpeg_cmds += ['-filter_complex'] + [
      ''.join([f'[{index}:0]' for index, _ in enumerate(audio_files)])
      + f'concat=n={len(audio_files)}:v=0:a=1[outa]'
  ] + ['-map', '[outa]', output_path]

  Utils.execute_subprocess_commands(
      cmds=ffmpeg_cmds,
      description=(
          f'Merge {len(audio_files)} audio files and output to {output_path}.'
      ),
  )
  # BUG FIX: the mode must be an octal literal. The previous decimal `777`
  # (== 0o1411) set the sticky bit and removed group/other permissions.
  os.chmod(output_path, 0o777)


def combine_analysis_chunks(
    analysis_chunks: Sequence[pd.DataFrame]
) -> pd.DataFrame:
  """Combines audio analysis chunks into a single response.

  Each chunk's segment ids and timestamps are shifted by the running
  maximums of all preceding chunks, so the combined result reads as one
  contiguous timeline.

  Args:
    analysis_chunks: Per-chunk analysis dataframes, each with the columns
      `audio_segment_id`, `start_s` and `end_s`.

  Returns:
    A single dataframe with all chunks concatenated and re-indexed.
    Empty if `analysis_chunks` is empty.
  """
  shifted_chunks = []
  max_audio_segment_id = 0
  max_end_s = 0

  for chunk in analysis_chunks:
    # Work on a copy so the caller's dataframes are not mutated in place.
    chunk = chunk.copy()
    chunk['audio_segment_id'] += max_audio_segment_id
    chunk['start_s'] += max_end_s
    chunk['end_s'] += max_end_s

    max_audio_segment_id = chunk['audio_segment_id'].max()
    max_end_s = chunk['end_s'].max()

    shifted_chunks.append(chunk)

  if not shifted_chunks:
    return pd.DataFrame()
  # Concatenate once at the end instead of repeatedly inside the loop
  # (repeated pd.concat is quadratic in the number of chunks).
  return pd.concat(shifted_chunks, ignore_index=True)


def combine_subtitle_files(
    audio_output_dir: str,
    subtitles_output_path: str,
):
  """Combines audio analysis subtitle files content into a single file.

  Reads every per-chunk subtitle file in `audio_output_dir`, rebases each
  cue's timestamps onto the end time of the previous chunk, and writes the
  concatenated result to `subtitles_output_path`.

  Args:
    audio_output_dir: Directory containing the per-chunk subtitle files.
    subtitles_output_path: Path of the combined subtitle file to write.
  """
  # NOTE(review): pathlib glob order is OS-dependent, not sorted — presumably
  # the chunk file names happen to enumerate in order; verify against caller.
  subtitles_files = [
      str(file_path) for file_path in pathlib.Path(audio_output_dir).
      glob(f'*.{ConfigService.OUTPUT_SUBTITLES_TYPE}')
  ]
  logging.info(
      'THREADING - Combining %d subtitle files found in %s...',
      len(subtitles_files),
      audio_output_dir,
  )
  combined_content = ''
  # Running offset to rebase each chunk's cues; starts at time zero.
  last_timestamp = datetime.datetime.strptime('00:00:00.000', '%H:%M:%S.%f')

  for index, subtitles_file in enumerate(subtitles_files):
    with open(subtitles_file, 'r', encoding='utf-8') as f:
      lines = f.readlines()

    # Skip the 2-line header of every file after the first, so the combined
    # output contains a single header.
    if index:
      lines = lines[2:]

    for line in lines:
      if '-->' in line:
        # Cue timing line: shift both timestamps by the accumulated offset.
        # Timestamps are sliced as 'MM:SS.mmm' — assumes each chunk is under
        # one hour (audio chunks are 8 minutes) — TODO confirm.
        start, end = line.strip().split(' --> ')
        start_time = last_timestamp + datetime.timedelta(
            minutes=int(start[:2]),
            seconds=int(start[3:5]),
            milliseconds=int(start[6:]),
        )
        end_time = last_timestamp + datetime.timedelta(
            minutes=int(end[:2]),
            seconds=int(end[3:5]),
            milliseconds=int(end[6:]),
        )

        # Re-emit as 'HH:MM:SS.mmm' (strip microseconds down to millis).
        start = start_time.strftime('%H:%M:%S.%f')[:-3]
        end = end_time.strftime('%H:%M:%S.%f')[:-3]

        combined_content += f'{start} --> {end}\n'
      else:
        # Non-timing lines (cue text, blanks, header) pass through unchanged.
        combined_content += line

    # Advance the offset by this chunk's final cue end time.
    # NOTE(review): lines[-3] assumes the file ends with cue text followed by
    # two trailing lines after the last timing line — confirm writer output.
    _, end = lines[-3].strip().split(' --> ')
    last_timestamp += datetime.timedelta(
        minutes=int(end[:2]),
        seconds=int(end[3:5]),
        milliseconds=int(end[6:]),
    )

  with open(subtitles_output_path, 'w', encoding='utf-8') as f:
    f.write(combined_content)


def extract_audio(video_file_path: str) -> Optional[str]:
"""Extracts the audio track from a video file, if it exists.
Expand Down Expand Up @@ -82,6 +180,7 @@ def extract_audio(video_file_path: str) -> Optional[str]:
def split_audio(
output_dir: str,
audio_file_path: str,
prefix='',
) -> Tuple[str, str]:
"""Splits the audio into vocals and music tracks and returns their paths.
Expand All @@ -108,21 +207,34 @@ def split_audio(
os.rmdir(base_path)

vocals_file_path = str(
pathlib.Path(output_dir, ConfigService.OUTPUT_SPEECH_FILE)
pathlib.Path(output_dir, f'{prefix}{ConfigService.OUTPUT_SPEECH_FILE}')
)
music_file_path = str(
pathlib.Path(output_dir, ConfigService.OUTPUT_MUSIC_FILE)
pathlib.Path(output_dir, f'{prefix}{ConfigService.OUTPUT_MUSIC_FILE}')
)

if prefix:
os.rename(
f'{output_dir}/{ConfigService.OUTPUT_SPEECH_FILE}',
vocals_file_path,
)
os.rename(
f'{output_dir}/{ConfigService.OUTPUT_MUSIC_FILE}',
music_file_path,
)

return vocals_file_path, music_file_path


def transcribe_audio(output_dir: str, audio_file_path: str) -> pd.DataFrame:
def transcribe_audio(
output_dir: str,
audio_file_path: str,
) -> Tuple[pd.DataFrame, str, float]:
"""Transcribes an audio file and returns the transcription.
Args:
output_dir: directory where the transcription will be saved.
audio_file_path: path to the audio file that will be transcribed.
output_dir: Directory where the transcription will be saved.
audio_file_path: Path to the audio file that will be transcribed.
Returns:
A pandas dataframe with the transcription data.
Expand All @@ -139,14 +251,7 @@ def transcribe_audio(output_dir: str, audio_file_path: str) -> pd.DataFrame:
)

video_language = languages.get(alpha2=info.language).name
with open(
f'{output_dir}/{ConfigService.OUTPUT_LANGUAGE_FILE}', 'w', encoding='utf8'
) as f:
f.write(video_language)
logging.info(
'LANGUAGE - %s written successfully!',
ConfigService.OUTPUT_LANGUAGE_FILE,
)
language_probability = info.language_probability

results = list(segments)
results_dict = []
Expand All @@ -162,8 +267,8 @@ def transcribe_audio(output_dir: str, audio_file_path: str) -> pd.DataFrame:
)
writer({'segments': results_dict}, audio_file_path, {'highlight_words': True})
logging.info(
'TRANSCRIPTION - %s written successfully!',
ConfigService.OUTPUT_SUBTITLES_FILE,
'TRANSCRIPTION - transcript for %s written successfully!',
audio_file_path,
)

transcription_data = []
Expand All @@ -185,4 +290,4 @@ def transcribe_audio(output_dir: str, audio_file_path: str) -> pd.DataFrame:
'transcript',
],
)
return transcription_dataframe
return transcription_dataframe, video_language, language_probability
13 changes: 13 additions & 0 deletions service/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@
CONFIG_MULTIMODAL_ASSET_GENERATION = os.environ.get(
'CONFIG_MULTIMODAL_ASSET_GENERATION', 'false'
) == 'true'
# Maximum size, in bytes, of each video chunk (defaults to 1 GB).
CONFIG_MAX_VIDEO_CHUNK_SIZE = float(
    os.environ.get('CONFIG_MAX_VIDEO_CHUNK_SIZE', '1e9')
)
# Maximum duration, in seconds, of each audio chunk (defaults to 8 minutes).
CONFIG_MAX_AUDIO_CHUNK_SIZE = float(
    os.environ.get('CONFIG_MAX_AUDIO_CHUNK_SIZE', '480')
)

CONFIG_DEFAULT_SAFETY_CONFIG = {
generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: (
Expand Down Expand Up @@ -64,6 +76,7 @@
OUTPUT_DATA_FILE = 'data.json'
OUTPUT_COMBINATIONS_FILE = 'combos.json'
OUTPUT_AV_SEGMENTS_DIR = 'av_segments_cuts'
OUTPUT_ANALYSIS_CHUNKS_DIR = 'analysis_chunks'
OUTPUT_COMBINATION_ASSETS_DIR = 'assets'

GCS_BASE_URL = 'https://storage.mtls.cloud.google.com'
Expand Down
Loading

0 comments on commit ed742e4

Please sign in to comment.