Skip to content

Commit

Permalink
Implement key frames logic for DG image assets
Browse files Browse the repository at this point in the history
We now identify key frames via Gemini for all generated variants and extract those frames as additional Demand Gen image assets.
The existing logic of extracting video thumbnails via ffmpeg (see https://ffmpeg.org/ffmpeg-filters.html#thumbnail) remains the same.
This means that users may observe duplicate DG assets (as both thumbnails and Gemini may select the same frame), but that should not be a problem as users can freely select the image assets they want to use.

This change also fixes an issue with `utils.py` where VideoMetadata could not be initialised properly for folders with more than 2 subdirectories.
This has now been fixed, as the correct folder we need (root GCS folder) is always the penultimate element in the `.parents` list of the file path.

Change-Id: I0383a34103c40578776fedaa591bb6e9593c4000
  • Loading branch information
mohabfekry committed Jan 10, 2025
1 parent 29cf6ac commit 4636639
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 22 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ limitations under the License.
Update to the latest version by running `npm run update-app` after pulling the latest changes from the repository via `git pull --rebase --autostash`; you would need to redeploy the *UI* for features marked as `frontend`, and *GCP components* for features marked as `backend`.

* [January 2025] Happy New Year!
* `backend`: Added functionality to identify key frames using Gemini and extract them as additional Demand Gen image assets.
* `backend`: Improved the extraction process to maintain consistency across the generated descriptions and keywords per segment.
* [December 2024]
* `frontend`: Added functionality to generate Demand Gen text assets in a desired target language. Read more [here](#6-output-videos).
Expand Down
159 changes: 141 additions & 18 deletions service/combiner/combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,13 @@ def _render_video_variant(
}
}
if video_variant.render_settings.generate_image_assets:
StorageService.upload_gcs_dir(
source_directory=output_dir,
bucket_name=gcs_bucket_name,
target_dir=gcs_folder_path,
)
assets = _generate_image_assets(
vision_model=vision_model,
video_file_path=horizontal_combo_path,
gcs_bucket_name=gcs_bucket_name,
gcs_folder_path=gcs_folder_path,
Expand Down Expand Up @@ -774,6 +780,7 @@ def _render_video_variant(
continuous_audio_select_filter=continuous_audio_select_filter,
)
rendered_paths[format_type] = _render_format(
vision_model=vision_model,
input_video_path=horizontal_combo_path,
output_path=output_dir,
gcs_bucket_name=gcs_bucket_name,
Expand Down Expand Up @@ -865,6 +872,7 @@ def _get_variant_ffmpeg_commands(


def _render_format(
vision_model: GenerativeModel,
input_video_path: str,
output_path: str,
gcs_bucket_name: str,
Expand All @@ -878,6 +886,7 @@ def _render_format(
"""Renders a video variant in a specific format.
Args:
vision_model: The generative vision model to use.
input_video_path: The path to the input video to render.
output_path: The path to output to.
gcs_bucket_name: The name of the GCS bucket to upload to.
Expand Down Expand Up @@ -925,7 +934,13 @@ def _render_format(
'path': format_name,
}
if generate_image_assets:
StorageService.upload_gcs_dir(
source_directory=output_path,
bucket_name=gcs_bucket_name,
target_dir=gcs_folder_path,
)
assets = _generate_image_assets(
vision_model=vision_model,
video_file_path=output_video_path,
gcs_bucket_name=gcs_bucket_name,
gcs_folder_path=gcs_folder_path,
Expand Down Expand Up @@ -1095,6 +1110,7 @@ def _generate_video_script(


def _generate_image_assets(
vision_model: GenerativeModel,
video_file_path: str,
gcs_bucket_name: str,
gcs_folder_path: str,
Expand All @@ -1105,6 +1121,7 @@ def _generate_image_assets(
"""Generates image ad assets for a video variant in a specific format.
Args:
vision_model: The generative vision model to use.
video_file_path: The path to the input video to use.
gcs_bucket_name: The name of the GCS bucket to upload the assets to.
gcs_folder_path: The path to the GCS folder to upload the assets to.
Expand All @@ -1125,30 +1142,32 @@ def _generate_image_assets(
assets = []
try:
os.makedirs(image_assets_path, exist_ok=True)
Utils.execute_subprocess_commands(
cmds=[
'ffmpeg',
'-i',
video_file_path,
'-vf',
'thumbnail',
'-vsync',
'vfr',
str(pathlib.Path(image_assets_path, '%d.png')),
],
description=(
f'extract image assets for {format_type} type for '
f'variant with id {variant_id} using ffmpeg'
),
_extract_video_thumbnails(
video_file_path=video_file_path,
image_assets_path=image_assets_path,
variant_id=variant_id,
format_type=format_type,
)
_identify_and_extract_key_frames(
vision_model=vision_model,
video_file_path=video_file_path,
image_assets_path=image_assets_path,
gcs_bucket_name=gcs_bucket_name,
gcs_folder_path=gcs_folder_path,
output_path=output_path,
variant_id=variant_id,
format_type=format_type,
)
assets = [
f'{ConfigService.GCS_BASE_URL}/{gcs_bucket_name}/'
f'{parse.quote(gcs_folder_path)}/'
f'{variant_folder}/{ConfigService.OUTPUT_COMBINATION_ASSETS_DIR}/'
f'{format_type}/{image_asset}'
for image_asset in os.listdir(image_assets_path)
if image_asset.endswith('.png')
f'{format_type}/{image_asset}' for image_asset in sorted(
os.listdir(image_assets_path), key=lambda asset:
int(asset.split('/')[-1].replace('.png', '').replace('.jpg', ''))
) if image_asset.endswith('.png') or image_asset.endswith('.jpg')
]

logging.info(
'ASSETS - Generated %d image assets for variant %d in %s format',
len(assets),
Expand All @@ -1165,6 +1184,110 @@ def _generate_image_assets(
return assets


def _extract_video_thumbnails(
    video_file_path: str,
    image_assets_path: str,
    variant_id: int,
    format_type: str,
):
  """Extracts video thumbnails as image assets for a video in a specific format.

  Uses ffmpeg's `thumbnail` filter to pick representative frames and writes
  them as sequentially numbered PNG files (1.png, 2.png, ...) into
  `image_assets_path`.

  Args:
    video_file_path: The path to the input video to use.
    image_assets_path: The path to store image assets in.
    variant_id: The id of the variant to render.
    format_type: The type of the output format (horizontal, vertical, square).
  """
  # %d is expanded by ffmpeg into a 1-based frame counter.
  output_pattern = str(pathlib.Path(image_assets_path, '%d.png'))
  ffmpeg_cmds = [
      'ffmpeg',
      '-i',
      video_file_path,
      '-vf',
      'thumbnail',
      # Variable frame rate sync: only the selected thumbnail frames are
      # written, rather than one image per source frame.
      '-vsync',
      'vfr',
      output_pattern,
  ]
  Utils.execute_subprocess_commands(
      cmds=ffmpeg_cmds,
      description=(
          f'extract thumbnails for {format_type} type for '
          f'variant with id {variant_id} using ffmpeg'
      ),
  )


def _identify_and_extract_key_frames(
    vision_model: GenerativeModel,
    video_file_path: str,
    image_assets_path: str,
    gcs_bucket_name: str,
    gcs_folder_path: str,
    output_path: str,
    variant_id: int,
    format_type: str,
):
  """Identifies key frames via Gemini and extracts them from the given video.

  Prompts the vision model with the GCS-hosted copy of the rendered video,
  parses timestamps out of its text response, then extracts one JPEG per
  timestamp (1.jpg, 2.jpg, ...) into `image_assets_path` via ffmpeg.
  Identification failures are logged and swallowed so rendering continues.

  Args:
    vision_model: The generative vision model to use.
    video_file_path: The path to the input video to use.
    image_assets_path: The path to store image assets in.
    gcs_bucket_name: The name of the GCS bucket to upload the assets to.
    gcs_folder_path: The path to the GCS folder to upload the assets to.
    output_path: The path to output to.
    variant_id: The id of the variant to render.
    format_type: The type of the output format (horizontal, vertical, square).
  """
  key_frame_timestamps = []
  try:
    # The video was uploaded relative to the local output dir, so stripping
    # that prefix yields its path within the GCS folder.
    relative_video_path = video_file_path.replace(f'{output_path}/', '')
    video_uri = (
        f'gs://{gcs_bucket_name}/{gcs_folder_path}/{relative_video_path}'
    )
    response = vision_model.generate_content(
        [
            Part.from_uri(video_uri, mime_type='video/mp4'),
            ConfigService.KEY_FRAMES_PROMPT,
        ],
        generation_config=ConfigService.KEY_FRAMES_CONFIG,
        safety_settings=ConfigService.CONFIG_DEFAULT_SAFETY_CONFIG,
    )
    candidates = response.candidates
    if (
        candidates and candidates[0].content.parts
        and candidates[0].content.parts[0].text
    ):
      key_frame_timestamps = re.findall(
          ConfigService.KEY_FRAMES_PATTERN,
          candidates[0].content.parts[0].text,
          re.MULTILINE,
      )
    else:
      logging.warning('ASSETS - Could not identify key frames!')
  # Execution should continue regardless of the underlying exception
  # pylint: disable=broad-exception-caught
  except Exception:
    logging.exception('Encountered error while identifying key frames!')

  # No-op when identification failed or returned nothing.
  for frame_index, timestamp in enumerate(key_frame_timestamps):
    Utils.execute_subprocess_commands(
        cmds=[
            'ffmpeg',
            # -ss before -i seeks the input to the key frame's timestamp.
            '-ss',
            timestamp,
            '-i',
            video_file_path,
            # Grab exactly one frame at high JPEG quality.
            '-frames:v',
            '1',
            '-q:v',
            '2',
            str(pathlib.Path(image_assets_path, f'{frame_index + 1}.jpg')),
        ],
        description=(
            f'extract key frames for {format_type} type for '
            f'variant with id {variant_id} using ffmpeg'
        ),
    )


def _group_consecutive_segments(
av_segment_ids: Sequence[str],
) -> Sequence[Tuple[str, str]]:
Expand Down
27 changes: 27 additions & 0 deletions service/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,3 +219,30 @@
Descriptions:
"""

# Generation settings for the key-frame identification request: low
# temperature keeps timestamp output deterministic across runs.
KEY_FRAMES_CONFIG = {
    'max_output_tokens': 8192,
    'temperature': 0.2,
    'top_p': 1,
}
# Captures the timestamp from lines like "[01:23] explanation ...".
# Must be a raw string: '\[' / '\]' in a plain string literal are invalid
# escape sequences (SyntaxWarning on Python 3.12+, slated to become errors).
KEY_FRAMES_PATTERN = r'\[(.*)\].*'
# Prompt instructing Gemini to return "[minutes:seconds]"-prefixed lines,
# which KEY_FRAMES_PATTERN then parses.
KEY_FRAMES_PROMPT = """You are an expert in analyzing video ad content for marketing purposes.
Given a video ad, your task is to identify timestamps of the most important frames in the video. These are frames that are visually impactful, evoke strong emotions, and are most likely to be remembered by viewers.
**Constraints:**
* **Accuracy:** It is crucial that the timestamps you provide are as accurate as possible, down to the second. Pay very close attention to the video timeline.
* **First and Last Frames:** Always include the first and last frames of the video in your analysis, in addition to other key frames.
* **No Motion Blur:** Do not include any frames that exhibit motion blur. All frames must be clear and in focus.
Consider the following factors when making your selections:
* *Visual impact:* Frames with striking visuals, unique compositions, or memorable imagery. Prioritize images with vibrant colors, strong contrast, and clear focus. Avoid images with motion blur, poor lighting, or cluttered compositions.
* *Emotional resonance:* Frames that elicit strong emotions such as joy, surprise, curiosity, or inspiration. Consider how the visuals, music, and voiceover work together to create an emotional impact.
* *Brand and product messaging:* Frames that clearly communicate the brand identity and values, or key selling points for the depicted products.
* *Audio cues:* Pay attention to how music, sound effects, and voiceover align with key visuals to emphasize important moments.
* *Storytelling:* Identify frames that mark crucial moments in the narrative arc of the ad, such as the introduction of a problem, the climax, and the resolution.
Provide precise timestamps in the format [minutes:seconds]. Once you've identified the timestamps, review the video again to ensure the timestamps accurately correspond to the frames you've described.
Output a list of timestamps along with a brief explanation of why each frame is significant.
Do not output any other text before or after the timestamps list.
"""
5 changes: 1 addition & 4 deletions service/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,7 @@ def __init__(self, filepath: str):
self.file_name = file_path.name
self.file_ext = file_ext[1:]
self.gcs_folder = str(file_path.parents[0])
self.gcs_root_folder = (
str(file_path.parents[1])
if len(file_path.parents) > 2 else self.gcs_folder
)
self.gcs_root_folder = str(file_path.parents[-2])

self.video_metadata = VideoMetadata(self.gcs_root_folder)
self.full_gcs_path = filepath
Expand Down

0 comments on commit 4636639

Please sign in to comment.