Added clip feature extraction
digbose92 committed Apr 2, 2023
1 parent edf842b commit fd1ad48
Showing 21 changed files with 719 additions and 22 deletions.
47 changes: 47 additions & 0 deletions configs/config_MHA_single_task_classifier_audio_only.yaml
@@ -0,0 +1,47 @@
data:
  csv_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/SAIM_data/SAIM_multi_task_tone_soc_message_topic_data_no_zero_files.csv'
  embedding_file: '/data/digbose92/ads_complete_repo/ads_features/ast_embeddings/ast_embs_0.5.pkl'

parameters:
  batch_size: 16
  train_shuffle: True
  val_shuffle: False
  epochs: 50
  early_stop: 5
  max_length: 14
  num_workers: 0
  task_name: 'social_message'

device:
  is_cuda: True

loss:
  loss_option: 'bce_cross_entropy_loss'

optimizer:
  choice: 'Adam'
  lr: 1e-4
  gamma: 0.95
  step_size: 15
  #scheduler: 'step_lr'
  mode: 'max'
  decay: 0.001
  patience: 5
  factor: 0.5
  verbose: True

model:
  model_type: 'MHA_attn_single_task_classifier_audio_only'
  input_dim: 768
  model_dim: 256
  num_heads: 2
  num_layers: 2
  input_dropout: 0.2
  output_dropout: 0.2
  model_dropout: 0.2
  n_classes: 2
  batch_first: True

output:
  model_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/model_dir'
  log_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/log_dir'
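
A minimal sketch of how a config like this might be consumed, assuming PyYAML is available (the load path below is illustrative, not taken from the repo):

import yaml

with open('configs/config_MHA_single_task_classifier_audio_only.yaml') as f:
    config = yaml.safe_load(f)

params = config['parameters']               # batch_size, epochs, task_name, ...
lr = float(config['optimizer']['lr'])       # PyYAML reads '1e-4' as a string, so cast
assert config['model']['input_dim'] == 768  # AST embedding dimension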
47 changes: 47 additions & 0 deletions configs/config_MHA_topic_classifier_audio_only.yaml
@@ -0,0 +1,47 @@
data:
  csv_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/SAIM_data/SAIM_multi_task_tone_soc_message_topic_data_no_zero_files.csv'
  embedding_file: '/data/digbose92/ads_complete_repo/ads_features/ast_embeddings/ast_embs_0.5.pkl'
  topic_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/topic_list_18.json'

parameters:
  batch_size: 16
  train_shuffle: True
  val_shuffle: False
  epochs: 50
  early_stop: 5
  max_length: 14
  num_workers: 0
  task_name: 'Topic'

device:
  is_cuda: True

loss:
  loss_option: 'cross_entropy_loss'

optimizer:
  choice: 'Adam'
  lr: 1e-4
  gamma: 0.95
  step_size: 15
  mode: 'max'
  decay: 0.001
  patience: 5
  factor: 0.5
  verbose: True

model:
  model_type: 'MHA_attn_single_task_classifier_audio_only'
  input_dim: 768
  model_dim: 256
  num_heads: 4
  num_layers: 4
  input_dropout: 0.2
  output_dropout: 0.2
  model_dropout: 0.2
  n_classes: 18
  batch_first: True

output:
  model_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/model_dir'
  log_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/log_dir'
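
The topic config adds topic_file and sets n_classes: 18. A sketch of deriving the 18-way label map, assuming topic_list_18.json stores a flat list of topic names (the actual schema is not shown in this diff):

import json

with open('topic_list_18.json') as f:   # illustrative relative path
    topic_list = json.load(f)

label_map = {topic: idx for idx, topic in enumerate(topic_list)}
assert len(label_map) == 18  # must agree with model.n_classes above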
48 changes: 48 additions & 0 deletions configs/config_MHA_topic_classifier_shot_level.yaml
@@ -0,0 +1,48 @@
data:
  csv_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/SAIM_data/SAIM_multi_task_tone_soc_message_topic_data_no_zero_files.csv'
  topic_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/topic_list_18.json'
  base_folder: '/data/digbose92/ads_complete_repo/ads_features/shot_embeddings/vit_features'

parameters:
  batch_size: 16
  train_shuffle: True
  val_shuffle: False
  epochs: 50
  early_stop: 5
  max_length: 35
  num_workers: 1
  task_name: 'Topic'

device:
  is_cuda: True

loss:
  loss_option: 'cross_entropy_loss'

optimizer:
  choice: 'AdamW'
  lr: 1e-4
  gamma: 0.5
  step_size: 15
  #scheduler: 'step_lr'
  #mode: 'max'
  decay: 0.001
  patience: 5
  factor: 0.5
  verbose: True

model:
  model_type: 'MHA_attn_single_task_classifier_shot_level'
  input_dim: 768
  model_dim: 128
  num_heads: 4
  num_layers: 4
  input_dropout: 0.2
  output_dropout: 0.2
  model_dropout: 0.2
  n_classes: 18
  batch_first: True

output:
  model_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/model_dir'
  log_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/log_dir'
File renamed without changes.
Binary file modified datasets/__pycache__/dataset.cpython-38.pyc
Binary file not shown.
77 changes: 75 additions & 2 deletions datasets/dataset.py
@@ -210,7 +210,6 @@ def __getitem__(self,idx):
        return(clip_feature_array_padded,ret_label,attention_mask)

### dataset for shot level modeling single task ###

class SAIM_single_task_dataset_shot_level(Dataset):

    def __init__(self,csv_data,base_folder,label_map,num_classes,max_length,task_name):
@@ -230,7 +229,7 @@ def __init__(self,csv_data,base_folder,label_map,num_classes,max_length,task_name):
        self.task_name=task_name

    def __len__(self):
        return(len(self.shot_feature_list))
        return(len(self.csv_data))

    def pad_data(self,feat_data):

@@ -289,7 +288,81 @@ def __getitem__(self,idx):
        #return the shot features, return label and attention mask
        return(shot_feat_padded,ret_label,attention_mask)

### dataset for audio only modeling single task ###
class SAIM_single_task_dataset_audio_only(Dataset): #audio only dataset

    def __init__(self,csv_data,embedding_file,label_map,num_classes,max_length,task_name):

        #arguments here
        self.csv_data=csv_data
        self.num_classes=num_classes
        self.max_length=max_length
        self.clip_feature_list=self.csv_data['clip_feature_path'].tolist()
        self.label_map=label_map
        self.task_name=task_name
        self.embedding_file=embedding_file
        #print(self.task_name)

        #load the embedding file
        with open(self.embedding_file, 'rb') as f:
            self.embedding = pickle.load(f)

        #ast embeddings
        self.ast_embeds=self.embedding['data']['embeddings']

        #get the keys
        self.clip_keys=[os.path.splitext(file.split("/")[-1])[0] for file in self.clip_feature_list]

    def __len__(self):
        #length follows the csv rows
        return(len(self.csv_data))

    def pad_data(self,feat_data):

        #padded data and attention mask
        padded=np.zeros((self.max_length,feat_data.shape[1]))

        if(feat_data.shape[0]>self.max_length):
            #truncate to max_length and attend to every step
            padded=feat_data[:self.max_length,:]
            attn_mask=np.ones((self.max_length))
        else:
            #zero-pad and mask out the padded steps
            attn_mask=np.zeros((self.max_length))
            padded[:feat_data.shape[0],:]=feat_data
            attn_mask[:feat_data.shape[0]]=1

        return(padded,attn_mask)

    def __getitem__(self,idx):

        #get the clip key
        clip_key=self.clip_keys[idx]

        #get the audio features
        audio_feat=self.ast_embeds[clip_key].cpu().numpy()

        #pad the data
        audio_feat_padded,attention_mask=self.pad_data(audio_feat)

        #get the label
        if((self.task_name=='social_message') or (self.task_name=='Transition_val')):
            #binary tasks: one-hot label vector
            label_c=self.label_map[self.csv_data[self.task_name].iloc[idx]]
            ret_label=np.zeros((self.num_classes))
            ret_label[label_c]=1

        elif(self.task_name=='Topic'):
            #topic task: integer class index
            ret_label=self.label_map[self.csv_data[self.task_name].iloc[idx]]

        #return the audio features, label and attention mask
        return(audio_feat_padded,ret_label,attention_mask)
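
#--- editorial usage sketch (not part of this commit): wiring the new dataset
#--- into a DataLoader; the CSV path and the binary label_map values below
#--- are assumptions for illustration
import pandas as pd
from torch.utils.data import DataLoader

csv_data=pd.read_csv('SAIM_multi_task_tone_soc_message_topic_data_no_zero_files.csv')
label_map={'No':0,'Yes':1} #hypothetical label values for 'social_message'

dataset=SAIM_single_task_dataset_audio_only(csv_data,'ast_embs_0.5.pkl',label_map,
                                            num_classes=2,max_length=14,task_name='social_message')
loader=DataLoader(dataset,batch_size=16,shuffle=True,num_workers=0)

audio_feats,labels,attn_masks=next(iter(loader))
#audio_feats: (16, 14, 768), attn_masks: (16, 14); max_length and input_dim
#match config_MHA_single_task_classifier_audio_only.yaml above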
95 changes: 83 additions & 12 deletions feature_extraction/extract_clip_features.py
@@ -10,6 +10,7 @@
import cv2
from tqdm import tqdm
import numpy as np
import math
#extract from the videos directly

def generate_video_prediction(model,preprocess,device,video_file,batch_size=32):
@@ -121,11 +122,8 @@ def generate_predictions_clip_ads(model,preprocess,frames_location,key,device,ba


    #encode the features with keys in the dictionary
    # with torch.no_grad():
    #     image_features = model.encode_image(image)
    dict_frame_list={'Features':feature_list,'Keys':key_list}
    assert(dict_frame_list['Features'].shape[0]==len(key_list))
    #print(key_list[0:1000])
@@ -184,21 +182,94 @@ def run_video_inference_updated(video_list,model,device,preprocess,destination_f
        with open(vid_destination_file,"wb") as f:
            pickle.dump(dict_temp,f)

#print('Number of files processed for the updated set: %d' %(cnt_num_files))
def run_frame_wise_feature_inference_reduced_fps(model,preprocess,device,shot_filename,dim=512,frameRate=24,desired_frameRate=4):

    #video filename + frame rate
    vcap=cv2.VideoCapture(shot_filename)
    frameRate = vcap.get(5) #actual fps (CAP_PROP_FPS) overrides the default argument
    intfactor=math.ceil(frameRate/desired_frameRate) #keep every intfactor-th frame
    feature_list=np.zeros((0,dim))
    frame_id=0

    length = int(vcap.get(cv2.CAP_PROP_FRAME_COUNT))

    while True:
        ret, frame = vcap.read()
        if(ret==True):
            if (frame_id % intfactor == 0):

                frame=cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
                frame=Image.fromarray(frame)
                frame = preprocess(frame).unsqueeze(0).to(device) #add to device

                #model encoding images
                with torch.no_grad():
                    image_features = model.encode_image(frame)

                #image features to numpy array
                image_features=image_features.cpu().numpy()
                feature_list=np.vstack([feature_list,image_features]) #add the features to the numpy array

                torch.cuda.empty_cache()
            frame_id=frame_id+1
        else:
            break
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    return feature_list, frame_id
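
#--- editorial check (not part of this commit): the sampling stride above is
#--- intfactor=ceil(fps/desired_frameRate), so at 24 fps with a desired 4 fps
#--- every 6th frame is kept (~4 fps effective)
import math

for fps in (24, 25, 30):
    stride=math.ceil(fps/4)
    print(fps, stride, round(fps/stride, 2)) #24->6 (4.0 fps), 25->7 (~3.57), 30->8 (3.75)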

def run_shot_inference(shot_folder,model,device,preprocess,destination_folder):

    #extract CLIP features at 4 fps for every shot
    #create a single dictionary per video with keys as shot names and values as features

    shot_list=os.listdir(shot_folder)

    for video_file in tqdm(shot_list):

        #video subfolder and pkl filename
        video_subfolder=os.path.join(shot_folder,video_file)
        pkl_filename=video_file+'.pkl'

        #destination file
        destination_file=os.path.join(destination_folder,pkl_filename)

        if(os.path.exists(destination_file) is False):

            shot_list=os.listdir(video_subfolder) #list of shots (rebinds the outer name; the running loop is unaffected)
            shot_dict=dict()
            for shot_file in tqdm(shot_list):

                shot_filename=os.path.join(video_subfolder,shot_file) #shot filename
                feat_list,_=run_frame_wise_feature_inference_reduced_fps(model,preprocess,device,shot_filename) #feature array for this shot
                #print(feat_list.shape)
                shot_dict[shot_file]=feat_list # dictionary mapping shot filename to features

            #save the shot_dict to a pickle file
            with open(destination_file,'wb') as f:
                pickle.dump(shot_dict,f)

        # else:
        #     print('Already exists',pkl_filename)
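
#--- editorial read-back sketch (not part of this commit): each pickle written
#--- above maps shot filename -> (num_sampled_frames, 512) CLIP ViT-B/32 features;
#--- 'some_video.pkl' is an illustrative name
import pickle

with open('some_video.pkl','rb') as f:
    shot_dict=pickle.load(f)

for shot_name,feats in shot_dict.items():
    print(shot_name,feats.shape) #e.g. ('shot_001.mp4', (37, 512))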

if __name__=='__main__':

    feature_destination_folder="/data/digbose92/ads_complete_repo/ads_features/clip_embeddings/jwt_ads_of_world"
    #feature_destination_folder="/data/digbose92/ads_complete_repo/ads_features/clip_embeddings/jwt_ads_of_world"
    # Load the model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load('ViT-B/32', device)
    shot_folder="/data/digbose92/ads_complete_repo/ads_videos/shot_folder/PySceneDetect"
    destination_folder="/data/digbose92/ads_complete_repo/ads_features/shot_embeddings/clip_features_4fps"
    run_shot_inference(shot_folder,model,device,preprocess,destination_folder)

    feature_file="/data/digbose92/ads_complete_repo/ads_codes/updated_pkl_files/corrected_files/zero_clip_features_files.pkl"
    #"/data/digbose92/ads_complete_repo/ads_codes/updated_pkl_files/corrected_files/file_list_clip_features_extraction_remaining.pkl"
    with open(feature_file, "rb") as f:
        video_file_list=pickle.load(f)

    run_video_inference_updated(video_file_list,model,device,preprocess,feature_destination_folder)
    # feature_file="/data/digbose92/ads_complete_repo/ads_codes/updated_pkl_files/corrected_files/zero_clip_features_files.pkl"
    # #"/data/digbose92/ads_complete_repo/ads_codes/updated_pkl_files/corrected_files/file_list_clip_features_extraction_remaining.pkl"
    # with open(feature_file, "rb") as f:
    #     video_file_list=pickle.load(f)

    # run_video_inference_updated(video_file_list,model,device,preprocess,feature_destination_folder)



3 changes: 0 additions & 3 deletions feature_extraction/extract_vit_features.py
@@ -88,8 +88,6 @@ def run_frame_wise_feature_inference(model,processor,filename,device,dim=768,des
processor=ViTFeatureExtractor.from_pretrained(model_name)
#print layer wise names
#declare the transforms


print('Loaded model')
#load the model along with the logits
# h1 = model.pre_logits.register_forward_hook(getActivation('pre_logits'))
@@ -98,7 +96,6 @@


#read the list of shot files already processed

with open(shot_file_list,'r') as f:
    shot_filenames=f.readlines()