Skip to content

Commit

Permalink
Implement key frames logic for DG image assets
Browse files Browse the repository at this point in the history
We now identify key frames via Gemini for all generated variants and extract those frames as additional Demand Gen image assets.
The existing logic of extracting video thumbnails via ffmpeg (see https://ffmpeg.org/ffmpeg-filters.html#thumbnail) remains the same.
This means that users may observe duplicate DG assets (as both thumbnails and Gemini may select the same frame), but that should not be a problem as users can freely select the image assets they want to use.

This change also fixes an issue with `utils.py` where VideoMetadata could not be initialised properly for folders with more than 2 subdirectories.
This has now been fixed, as the correct folder we need (root GCS folder) is always the penultimate element in the `.parents` list of the file path.

Change-Id: I0383a34103c40578776fedaa591bb6e9593c4000
  • Loading branch information
mohabfekry committed Jan 10, 2025
1 parent 29cf6ac commit 4636639
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 22 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ limitations under the License.
Update to the latest version by running `npm run update-app` after pulling the latest changes from the repository via `git pull --rebase --autostash`; you would need to redeploy the *UI* for features marked as `frontend`, and *GCP components* for features marked as `backend`.

* [January 2025] Happy New Year!
* `backend`: Added functionality to identify key frames using Gemini and extract them as additional Demand Gen image assets.
* `backend`: Improved the extraction process to maintain consistency across the generated descriptions and keywords per segment.
* [December 2024]
* `frontend`: Added functionality to generate Demand Gen text assets in a desired target language. Read more [here](#6-output-videos).
Expand Down
159 changes: 141 additions & 18 deletions service/combiner/combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,13 @@ def _render_video_variant(
}
}
if video_variant.render_settings.generate_image_assets:
StorageService.upload_gcs_dir(
source_directory=output_dir,
bucket_name=gcs_bucket_name,
target_dir=gcs_folder_path,
)
assets = _generate_image_assets(
vision_model=vision_model,
video_file_path=horizontal_combo_path,
gcs_bucket_name=gcs_bucket_name,
gcs_folder_path=gcs_folder_path,
Expand Down Expand Up @@ -774,6 +780,7 @@ def _render_video_variant(
continuous_audio_select_filter=continuous_audio_select_filter,
)
rendered_paths[format_type] = _render_format(
vision_model=vision_model,
input_video_path=horizontal_combo_path,
output_path=output_dir,
gcs_bucket_name=gcs_bucket_name,
Expand Down Expand Up @@ -865,6 +872,7 @@ def _get_variant_ffmpeg_commands(


def _render_format(
vision_model: GenerativeModel,
input_video_path: str,
output_path: str,
gcs_bucket_name: str,
Expand All @@ -878,6 +886,7 @@ def _render_format(
"""Renders a video variant in a specific format.
Args:
vision_model: The generative vision model to use.
input_video_path: The path to the input video to render.
output_path: The path to output to.
gcs_bucket_name: The name of the GCS bucket to upload to.
Expand Down Expand Up @@ -925,7 +934,13 @@ def _render_format(
'path': format_name,
}
if generate_image_assets:
StorageService.upload_gcs_dir(
source_directory=output_path,
bucket_name=gcs_bucket_name,
target_dir=gcs_folder_path,
)
assets = _generate_image_assets(
vision_model=vision_model,
video_file_path=output_video_path,
gcs_bucket_name=gcs_bucket_name,
gcs_folder_path=gcs_folder_path,
Expand Down Expand Up @@ -1095,6 +1110,7 @@ def _generate_video_script(


def _generate_image_assets(
vision_model: GenerativeModel,
video_file_path: str,
gcs_bucket_name: str,
gcs_folder_path: str,
Expand All @@ -1105,6 +1121,7 @@ def _generate_image_assets(
"""Generates image ad assets for a video variant in a specific format.
Args:
vision_model: The generative vision model to use.
video_file_path: The path to the input video to use.
gcs_bucket_name: The name of the GCS bucket to upload the assets to.
gcs_folder_path: The path to the GCS folder to upload the assets to.
Expand All @@ -1125,30 +1142,32 @@ def _generate_image_assets(
assets = []
try:
os.makedirs(image_assets_path, exist_ok=True)
Utils.execute_subprocess_commands(
cmds=[
'ffmpeg',
'-i',
video_file_path,
'-vf',
'thumbnail',
'-vsync',
'vfr',
str(pathlib.Path(image_assets_path, '%d.png')),
],
description=(
f'extract image assets for {format_type} type for '
f'variant with id {variant_id} using ffmpeg'
),
_extract_video_thumbnails(
video_file_path=video_file_path,
image_assets_path=image_assets_path,
variant_id=variant_id,
format_type=format_type,
)
_identify_and_extract_key_frames(
vision_model=vision_model,
video_file_path=video_file_path,
image_assets_path=image_assets_path,
gcs_bucket_name=gcs_bucket_name,
gcs_folder_path=gcs_folder_path,
output_path=output_path,
variant_id=variant_id,
format_type=format_type,
)
assets = [
f'{ConfigService.GCS_BASE_URL}/{gcs_bucket_name}/'
f'{parse.quote(gcs_folder_path)}/'
f'{variant_folder}/{ConfigService.OUTPUT_COMBINATION_ASSETS_DIR}/'
f'{format_type}/{image_asset}'
for image_asset in os.listdir(image_assets_path)
if image_asset.endswith('.png')
f'{format_type}/{image_asset}' for image_asset in sorted(
os.listdir(image_assets_path), key=lambda asset:
int(asset.split('/')[-1].replace('.png', '').replace('.jpg', ''))
) if image_asset.endswith('.png') or image_asset.endswith('.jpg')
]

logging.info(
'ASSETS - Generated %d image assets for variant %d in %s format',
len(assets),
Expand All @@ -1165,6 +1184,110 @@ def _generate_image_assets(
return assets


def _extract_video_thumbnails(
    video_file_path: str,
    image_assets_path: str,
    variant_id: int,
    format_type: str,
):
  """Extracts video thumbnails as image assets for a video in a specific format.

  Uses ffmpeg's `thumbnail` filter to pick representative frames and writes
  them as sequentially numbered PNG files (1.png, 2.png, ...) into
  `image_assets_path`.

  Args:
    video_file_path: The path to the input video to use.
    image_assets_path: The path to store image assets in.
    variant_id: The id of the variant to render.
    format_type: The type of the output format (horizontal, vertical, square).
  """
  # %d is expanded by ffmpeg into a 1-based frame counter.
  output_pattern = str(pathlib.Path(image_assets_path, '%d.png'))
  ffmpeg_cmds = [
      'ffmpeg',
      '-i',
      video_file_path,
      '-vf',
      'thumbnail',
      # Variable frame rate sync: only the selected thumbnail frames are
      # written, rather than one image per source frame.
      '-vsync',
      'vfr',
      output_pattern,
  ]
  Utils.execute_subprocess_commands(
      cmds=ffmpeg_cmds,
      description=(
          f'extract thumbnails for {format_type} type for '
          f'variant with id {variant_id} using ffmpeg'
      ),
  )


def _identify_and_extract_key_frames(
    vision_model: GenerativeModel,
    video_file_path: str,
    image_assets_path: str,
    gcs_bucket_name: str,
    gcs_folder_path: str,
    output_path: str,
    variant_id: int,
    format_type: str,
):
  """Identifies key frames via Gemini and extracts them from the given video.

  Prompts the vision model with the GCS-hosted copy of the rendered video,
  parses timestamps out of its text response, then extracts one JPEG per
  timestamp (1.jpg, 2.jpg, ...) into `image_assets_path` via ffmpeg.
  Identification failures are logged and swallowed so rendering continues.

  Args:
    vision_model: The generative vision model to use.
    video_file_path: The path to the input video to use.
    image_assets_path: The path to store image assets in.
    gcs_bucket_name: The name of the GCS bucket to upload the assets to.
    gcs_folder_path: The path to the GCS folder to upload the assets to.
    output_path: The path to output to.
    variant_id: The id of the variant to render.
    format_type: The type of the output format (horizontal, vertical, square).
  """
  key_frame_timestamps = []
  try:
    # The video was uploaded relative to the local output dir, so stripping
    # that prefix yields its path within the GCS folder.
    relative_video_path = video_file_path.replace(f'{output_path}/', '')
    video_uri = (
        f'gs://{gcs_bucket_name}/{gcs_folder_path}/{relative_video_path}'
    )
    response = vision_model.generate_content(
        [
            Part.from_uri(video_uri, mime_type='video/mp4'),
            ConfigService.KEY_FRAMES_PROMPT,
        ],
        generation_config=ConfigService.KEY_FRAMES_CONFIG,
        safety_settings=ConfigService.CONFIG_DEFAULT_SAFETY_CONFIG,
    )
    candidates = response.candidates
    if (
        candidates and candidates[0].content.parts
        and candidates[0].content.parts[0].text
    ):
      key_frame_timestamps = re.findall(
          ConfigService.KEY_FRAMES_PATTERN,
          candidates[0].content.parts[0].text,
          re.MULTILINE,
      )
    else:
      logging.warning('ASSETS - Could not identify key frames!')
  # Execution should continue regardless of the underlying exception
  # pylint: disable=broad-exception-caught
  except Exception:
    logging.exception('Encountered error while identifying key frames!')

  # No-op when identification failed or returned nothing.
  for frame_index, timestamp in enumerate(key_frame_timestamps):
    Utils.execute_subprocess_commands(
        cmds=[
            'ffmpeg',
            # -ss before -i seeks the input to the key frame's timestamp.
            '-ss',
            timestamp,
            '-i',
            video_file_path,
            # Grab exactly one frame at high JPEG quality.
            '-frames:v',
            '1',
            '-q:v',
            '2',
            str(pathlib.Path(image_assets_path, f'{frame_index + 1}.jpg')),
        ],
        description=(
            f'extract key frames for {format_type} type for '
            f'variant with id {variant_id} using ffmpeg'
        ),
    )


def _group_consecutive_segments(
av_segment_ids: Sequence[str],
) -> Sequence[Tuple[str, str]]:
Expand Down
27 changes: 27 additions & 0 deletions service/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,30 @@
Descriptions:
"""

# Generation settings for the key-frame identification request: low
# temperature keeps timestamp output deterministic across runs.
KEY_FRAMES_CONFIG = {
    'max_output_tokens': 8192,
    'temperature': 0.2,
    'top_p': 1,
}
# Captures the timestamp from lines like "[01:23] explanation ...".
# Must be a raw string: '\[' / '\]' in a plain string literal are invalid
# escape sequences (SyntaxWarning on Python 3.12+, slated to become errors).
KEY_FRAMES_PATTERN = r'\[(.*)\].*'
# Prompt instructing Gemini to return "[minutes:seconds]"-prefixed lines,
# which KEY_FRAMES_PATTERN then parses.
KEY_FRAMES_PROMPT = """You are an expert in analyzing video ad content for marketing purposes.
Given a video ad, your task is to identify timestamps of the most important frames in the video. These are frames that are visually impactful, evoke strong emotions, and are most likely to be remembered by viewers.
**Constraints:**
* **Accuracy:** It is crucial that the timestamps you provide are as accurate as possible, down to the second. Pay very close attention to the video timeline.
* **First and Last Frames:** Always include the first and last frames of the video in your analysis, in addition to other key frames.
* **No Motion Blur:** Do not include any frames that exhibit motion blur. All frames must be clear and in focus.
Consider the following factors when making your selections:
* *Visual impact:* Frames with striking visuals, unique compositions, or memorable imagery. Prioritize images with vibrant colors, strong contrast, and clear focus. Avoid images with motion blur, poor lighting, or cluttered compositions.
* *Emotional resonance:* Frames that elicit strong emotions such as joy, surprise, curiosity, or inspiration. Consider how the visuals, music, and voiceover work together to create an emotional impact.
* *Brand and product messaging:* Frames that clearly communicate the brand identity and values, or key selling points for the depicted products.
* *Audio cues:* Pay attention to how music, sound effects, and voiceover align with key visuals to emphasize important moments.
* *Storytelling:* Identify frames that mark crucial moments in the narrative arc of the ad, such as the introduction of a problem, the climax, and the resolution.
Provide precise timestamps in the format [minutes:seconds]. Once you've identified the timestamps, review the video again to ensure the timestamps accurately correspond to the frames you've described.
Output a list of timestamps along with a brief explanation of why each frame is significant.
Do not output any other text before or after the timestamps list.
"""
5 changes: 1 addition & 4 deletions service/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,7 @@ def __init__(self, filepath: str):
self.file_name = file_path.name
self.file_ext = file_ext[1:]
self.gcs_folder = str(file_path.parents[0])
self.gcs_root_folder = (
str(file_path.parents[1])
if len(file_path.parents) > 2 else self.gcs_folder
)
self.gcs_root_folder = str(file_path.parents[-2])

self.video_metadata = VideoMetadata(self.gcs_root_folder)
self.full_gcs_path = filepath
Expand Down

0 comments on commit 4636639

Please sign in to comment.