Added updates to CLIP scene tagging

usc-sail · Apr 19, 2023 · 8775c1f · 8775c1f
1 parent e9f11b6
commit 8775c1f
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -20,10 +20,14 @@ This repository contains the codebase for MovieCLIP: Visual Scene Recognition in
 
 * Please refer to [**README.md**](https://github.com/usc-sail/mica-MovieCLIP/blob/main/split_files/README.md) under the ```data_splits``` folder for instructions on using the MovieCLIP dataset.
 
+## **Visual scene tagging**
+
+* Please refer to [**README.md**](https://github.com/usc-sail/mica-MovieCLIP/blob/main/preprocess_scripts/visual_scene_tagging/README.md) under the ```preprocess_scripts/visual_scene_tagging``` folder for instructions on using the CLIP model for tagging the visual scenes in the MovieCLIP dataset.
+
 ## **To Dos**
 
 - [x] Add the dataset link and instructions for using the MovieCLIP dataset
-- [ ] Add code for tagging using the CLIP model
+- [x] Add code for tagging using the CLIP model
 - [ ] Add code for training the baseline LSTM models
 - [ ] Add code for openmmlab setup and Swin-B model inference
 

diff --git a/preprocess_scripts/visual_scene_tagging/README.md b/preprocess_scripts/visual_scene_tagging/README.md
@@ -0,0 +1,8 @@
+## Visual scnene tagging
+
+* Use the file `../../split_files/MovieCLIP_taxonomy_split.txt` to run the script `visual_scene_tagging.py` to generate the visual scene tags for shots in `source_folder`.
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python clip_scene_tagging.py --label_file ./../split_files/MovieCLIP_taxonomy_split.txt --source_folder <base folder containing the shots subfolder> --output_folder <output folder path containing the json file with nested dictionary containing visual scene classes>
+```
+
diff --git a/preprocess_scripts/visual_scene_tagging/clip_scene_tagging.py b/preprocess_scripts/visual_scene_tagging/clip_scene_tagging.py
@@ -0,0 +1,93 @@
+import os
+from re import I 
+import pandas as pd 
+import numpy as np 
+from tqdm import tqdm
+import clip 
+import torch 
+import cv2 
+import pickle
+from PIL import Image
+import argparse
+import json
+
+def generate_shot_tags(shot_file_name,model,preprocess,text_features,device,label_list):
+
+    #read individual frame and use CLIP individual encoder 
+    vcap=cv2.VideoCapture(shot_file_name)
+    similarity_list=[]
+    while True:
+        ret, frame = vcap.read()
+        if(ret==True): # if it is a valid frame
+            frame=cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) #convert BGR to RGB
+            frame=Image.fromarray(frame) #convert BGR image to PIL Image
+            frame = preprocess(frame).unsqueeze(0).to(device) #preprocess the frame
+            with torch.no_grad():
+                image_features = model.encode_image(frame)
+
+            image_features /= image_features.norm(dim=-1, keepdim=True)
+            similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1) 
+            similarity_list.append(similarity)
+        else:
+            break
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break
+
+    similarity_score_tensor_frames=torch.cat(similarity_list,dim=0)
+
+    similarity_score_tensor_scores=similarity_score_tensor_frames.mean(dim=0)
+
+    values, indices = similarity_score_tensor_scores.topk(5)
+
+    label_list_val=[label_list[id] for id in indices]
+    val_list=[val.item() for val in values]
+
+    return(label_list_val,val_list)
+
+
+ap=argparse.ArgumentParser()
+ap.add_argument('--label_file',required=True,help='path to the label file')
+ap.add_argument('--source_folder',required=True,help='path to the source folder')
+ap.add_argument('--output_folder',required=True,help='path to the output folder')
+
+args_list=vars(ap.parse_args())
+
+label_file=args_list['label_file']
+source_folder=args_list['source_folder']
+
+with open(label_file,'r') as f:
+    label_list=f.readlines()
+
+label_list=[label.strip().split("\n")[0] for label in label_list]
+
+#initialize model and extract features for text labels through scene prompts
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model, preprocess = clip.load('ViT-B/32', device)
+text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}, a type of background location") for c in label_list]).to(device)
+with torch.no_grad():
+    text_features = model.encode_text(text_inputs)
+text_features /= text_features.norm(dim=-1, keepdim=True)
+
+shot_subfolders=os.listdir(source_folder)
+
+shot_dict={}
+
+for shot_subfolder in tqdm(shot_subfolders):
+    shot_subfolder_path=os.path.join(source_folder,shot_subfolder)
+    shot_files=os.listdir(shot_subfolder_path)
+
+    shot_temp_dict={}
+    for shot_file in tqdm(shot_files):
+        shot_file_path=os.path.join(shot_subfolder_path,shot_file)
+        label_list_val,val_list=generate_shot_tags(shot_file_path,model,preprocess,text_features,device,label_list)
+
+        #create a dictionary with the labels and the values
+        shot_tags_dict={k:v for k,v in zip(label_list_val,val_list)}
+
+        shot_temp_dict[shot_file]=shot_tags_dict
+
+    shot_dict[shot_subfolder]=shot_temp_dict
+
+output_file_name=os.path.join(args_list['output_folder'],'shot_tags.json')
+with open(output_file_name,'w') as f:
+    json.dump(shot_dict,f)