Added clip feature extraction
digbose92 committed Apr 2, 2023
1 parent edf842b commit fd1ad48
Showing 21 changed files with 719 additions and 22 deletions.
47 changes: 47 additions & 0 deletions configs/config_MHA_single_task_classifier_audio_only.yaml
@@ -0,0 +1,47 @@
data:
  csv_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/SAIM_data/SAIM_multi_task_tone_soc_message_topic_data_no_zero_files.csv'
  embedding_file: '/data/digbose92/ads_complete_repo/ads_features/ast_embeddings/ast_embs_0.5.pkl'

parameters:
  batch_size: 16
  train_shuffle: True
  val_shuffle: False
  epochs: 50
  early_stop: 5
  max_length: 14
  num_workers: 0
  task_name: 'social_message'

device:
  is_cuda: True

loss:
  loss_option: 'bce_cross_entropy_loss'

optimizer:
  choice: 'Adam'
  lr: 1e-4
  gamma: 0.95
  step_size: 15
  #scheduler: 'step_lr'
  mode: 'max'
  decay: 0.001
  patience: 5
  factor: 0.5
  verbose: True

model:
  model_type: 'MHA_attn_single_task_classifier_audio_only'
  input_dim: 768
  model_dim: 256
  num_heads: 2
  num_layers: 2
  input_dropout: 0.2
  output_dropout: 0.2
  model_dropout: 0.2
  n_classes: 2
  batch_first: True

output:
  model_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/model_dir'
  log_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/log_dir'
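
A minimal sketch of how a config like this might be consumed, assuming PyYAML is available (the load path below is illustrative, not taken from the repo):

import yaml

with open('configs/config_MHA_single_task_classifier_audio_only.yaml') as f:
    config = yaml.safe_load(f)

params = config['parameters']               # batch_size, epochs, task_name, ...
lr = float(config['optimizer']['lr'])       # PyYAML reads '1e-4' as a string, so cast
assert config['model']['input_dim'] == 768  # AST embedding dimension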
47 changes: 47 additions & 0 deletions configs/config_MHA_topic_classifier_audio_only.yaml
@@ -0,0 +1,47 @@
data:
  csv_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/SAIM_data/SAIM_multi_task_tone_soc_message_topic_data_no_zero_files.csv'
  embedding_file: '/data/digbose92/ads_complete_repo/ads_features/ast_embeddings/ast_embs_0.5.pkl'
  topic_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/topic_list_18.json'

parameters:
  batch_size: 16
  train_shuffle: True
  val_shuffle: False
  epochs: 50
  early_stop: 5
  max_length: 14
  num_workers: 0
  task_name: 'Topic'

device:
  is_cuda: True

loss:
  loss_option: 'cross_entropy_loss'

optimizer:
  choice: 'Adam'
  lr: 1e-4
  gamma: 0.95
  step_size: 15
  mode: 'max'
  decay: 0.001
  patience: 5
  factor: 0.5
  verbose: True

model:
  model_type: 'MHA_attn_single_task_classifier_audio_only'
  input_dim: 768
  model_dim: 256
  num_heads: 4
  num_layers: 4
  input_dropout: 0.2
  output_dropout: 0.2
  model_dropout: 0.2
  n_classes: 18
  batch_first: True

output:
  model_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/model_dir'
  log_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/log_dir'
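
The topic config adds topic_file and sets n_classes: 18. A sketch of deriving the 18-way label map, assuming topic_list_18.json stores a flat list of topic names (the actual schema is not shown in this diff):

import json

with open('topic_list_18.json') as f:   # illustrative relative path
    topic_list = json.load(f)

label_map = {topic: idx for idx, topic in enumerate(topic_list)}
assert len(label_map) == 18  # must agree with model.n_classes above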
48 changes: 48 additions & 0 deletions configs/config_MHA_topic_classifier_shot_level.yaml
@@ -0,0 +1,48 @@
data:
  csv_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/SAIM_data/SAIM_multi_task_tone_soc_message_topic_data_no_zero_files.csv'
  topic_file: '/data/digbose92/ads_complete_repo/ads_codes/SAIM-ADS/data/topic_list_18.json'
  base_folder: '/data/digbose92/ads_complete_repo/ads_features/shot_embeddings/vit_features'

parameters:
  batch_size: 16
  train_shuffle: True
  val_shuffle: False
  epochs: 50
  early_stop: 5
  max_length: 35
  num_workers: 1
  task_name: 'Topic'

device:
  is_cuda: True

loss:
  loss_option: 'cross_entropy_loss'

optimizer:
  choice: 'AdamW'
  lr: 1e-4
  gamma: 0.5
  step_size: 15
  #scheduler: 'step_lr'
  #mode: 'max'
  decay: 0.001
  patience: 5
  factor: 0.5
  verbose: True

model:
  model_type: 'MHA_attn_single_task_classifier_shot_level'
  input_dim: 768
  model_dim: 128
  num_heads: 4
  num_layers: 4
  input_dropout: 0.2
  output_dropout: 0.2
  model_dropout: 0.2
  n_classes: 18
  batch_first: True

output:
  model_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/model_dir'
  log_dir: '/data/digbose92/ads_complete_repo/ads_codes/model_files/recent_models/log_dir'
File renamed without changes.
Binary file modified datasets/__pycache__/dataset.cpython-38.pyc
Binary file not shown.
77 changes: 75 additions & 2 deletions datasets/dataset.py
@@ -210,7 +210,6 @@ def __getitem__(self,idx):
        return(clip_feature_array_padded,ret_label,attention_mask)

### dataset for shot level modeling single task ###

class SAIM_single_task_dataset_shot_level(Dataset):

    def __init__(self,csv_data,base_folder,label_map,num_classes,max_length,task_name):
@@ -230,7 +229,7 @@ def __init__(self,csv_data,base_folder,label_map,num_classes,max_length,task_name):
        self.task_name=task_name

    def __len__(self):
        return(len(self.shot_feature_list))
        return(len(self.csv_data))

    def pad_data(self,feat_data):

@@ -289,7 +288,81 @@ def __getitem__(self,idx):
        #return the shot features, return label and attention mask
        return(shot_feat_padded,ret_label,attention_mask)

### dataset for audio only modeling single task ###
class SAIM_single_task_dataset_audio_only(Dataset): #audio only dataset

    def __init__(self,csv_data,embedding_file,label_map,num_classes,max_length,task_name):

        #arguments here
        self.csv_data=csv_data
        self.num_classes=num_classes
        self.max_length=max_length
        self.clip_feature_list=self.csv_data['clip_feature_path'].tolist()
        self.label_map=label_map
        self.task_name=task_name
        self.embedding_file=embedding_file
        #print(self.task_name)

        #load the embedding file
        with open(self.embedding_file, 'rb') as f:
            self.embedding = pickle.load(f)

        #ast embeddings
        self.ast_embeds=self.embedding['data']['embeddings']

        #get the keys
        self.clip_keys=[os.path.splitext(file.split("/")[-1])[0] for file in self.clip_feature_list]

    def __len__(self):
        #length follows the csv rows
        return(len(self.csv_data))

    def pad_data(self,feat_data):

        #padded data and attention mask
        padded=np.zeros((self.max_length,feat_data.shape[1]))

        if(feat_data.shape[0]>self.max_length):
            #truncate to max_length and attend to every step
            padded=feat_data[:self.max_length,:]
            attn_mask=np.ones((self.max_length))
        else:
            #zero-pad and mask out the padded steps
            attn_mask=np.zeros((self.max_length))
            padded[:feat_data.shape[0],:]=feat_data
            attn_mask[:feat_data.shape[0]]=1

        return(padded,attn_mask)

    def __getitem__(self,idx):

        #get the clip key
        clip_key=self.clip_keys[idx]

        #get the audio features
        audio_feat=self.ast_embeds[clip_key].cpu().numpy()

        #pad the data
        audio_feat_padded,attention_mask=self.pad_data(audio_feat)

        #get the label
        if((self.task_name=='social_message') or (self.task_name=='Transition_val')):
            #binary tasks: one-hot label vector
            label_c=self.label_map[self.csv_data[self.task_name].iloc[idx]]
            ret_label=np.zeros((self.num_classes))
            ret_label[label_c]=1

        elif(self.task_name=='Topic'):
            #topic task: integer class index
            ret_label=self.label_map[self.csv_data[self.task_name].iloc[idx]]

        #return the audio features, label and attention mask
        return(audio_feat_padded,ret_label,attention_mask)
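
#--- editorial usage sketch (not part of this commit): wiring the new dataset
#--- into a DataLoader; the CSV path and the binary label_map values below
#--- are assumptions for illustration
import pandas as pd
from torch.utils.data import DataLoader

csv_data=pd.read_csv('SAIM_multi_task_tone_soc_message_topic_data_no_zero_files.csv')
label_map={'No':0,'Yes':1} #hypothetical label values for 'social_message'

dataset=SAIM_single_task_dataset_audio_only(csv_data,'ast_embs_0.5.pkl',label_map,
                                            num_classes=2,max_length=14,task_name='social_message')
loader=DataLoader(dataset,batch_size=16,shuffle=True,num_workers=0)

audio_feats,labels,attn_masks=next(iter(loader))
#audio_feats: (16, 14, 768), attn_masks: (16, 14); max_length and input_dim
#match config_MHA_single_task_classifier_audio_only.yaml above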
95 changes: 83 additions & 12 deletions feature_extraction/extract_clip_features.py
@@ -10,6 +10,7 @@
import cv2
from tqdm import tqdm
import numpy as np
import math
#extract from the videos directly

def generate_video_prediction(model,preprocess,device,video_file,batch_size=32):
@@ -121,11 +122,8 @@ def generate_predictions_clip_ads(model,preprocess,frames_location,key,device,ba


    #encode the features with keys in the dictionary
    # with torch.no_grad():
    #     image_features = model.encode_image(image)
    dict_frame_list={'Features':feature_list,'Keys':key_list}
    assert(dict_frame_list['Features'].shape[0]==len(key_list))
    #print(key_list[0:1000])
@@ -184,21 +182,94 @@ def run_video_inference_updated(video_list,model,device,preprocess,destination_f
        with open(vid_destination_file,"wb") as f:
            pickle.dump(dict_temp,f)

#print('Number of files processed for the updated set: %d' %(cnt_num_files))
def run_frame_wise_feature_inference_reduced_fps(model,preprocess,device,shot_filename,dim=512,frameRate=24,desired_frameRate=4):

    #video filename + frame rate
    vcap=cv2.VideoCapture(shot_filename)
    frameRate = vcap.get(5) #actual fps (CAP_PROP_FPS) overrides the default argument
    intfactor=math.ceil(frameRate/desired_frameRate) #keep every intfactor-th frame
    feature_list=np.zeros((0,dim))
    frame_id=0

    length = int(vcap.get(cv2.CAP_PROP_FRAME_COUNT))

    while True:
        ret, frame = vcap.read()
        if(ret==True):
            if (frame_id % intfactor == 0):

                frame=cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
                frame=Image.fromarray(frame)
                frame = preprocess(frame).unsqueeze(0).to(device) #add to device

                #model encoding images
                with torch.no_grad():
                    image_features = model.encode_image(frame)

                #image features to numpy array
                image_features=image_features.cpu().numpy()
                feature_list=np.vstack([feature_list,image_features]) #add the features to the numpy array

                torch.cuda.empty_cache()
            frame_id=frame_id+1
        else:
            break
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    return feature_list, frame_id
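
#--- editorial check (not part of this commit): the sampling stride above is
#--- intfactor=ceil(fps/desired_frameRate), so at 24 fps with a desired 4 fps
#--- every 6th frame is kept (~4 fps effective)
import math

for fps in (24, 25, 30):
    stride=math.ceil(fps/4)
    print(fps, stride, round(fps/stride, 2)) #24->6 (4.0 fps), 25->7 (~3.57), 30->8 (3.75)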

def run_shot_inference(shot_folder,model,device,preprocess,destination_folder):

    #extract CLIP features at 4 fps for every shot
    #create a single dictionary per video with keys as shot names and values as features

    shot_list=os.listdir(shot_folder)

    for video_file in tqdm(shot_list):

        #video subfolder and pkl filename
        video_subfolder=os.path.join(shot_folder,video_file)
        pkl_filename=video_file+'.pkl'

        #destination file
        destination_file=os.path.join(destination_folder,pkl_filename)

        if(os.path.exists(destination_file) is False):

            shot_list=os.listdir(video_subfolder) #list of shots (rebinds the outer name; the running loop is unaffected)
            shot_dict=dict()
            for shot_file in tqdm(shot_list):

                shot_filename=os.path.join(video_subfolder,shot_file) #shot filename
                feat_list,_=run_frame_wise_feature_inference_reduced_fps(model,preprocess,device,shot_filename) #feature array for this shot
                #print(feat_list.shape)
                shot_dict[shot_file]=feat_list # dictionary mapping shot filename to features

            #save the shot_dict to a pickle file
            with open(destination_file,'wb') as f:
                pickle.dump(shot_dict,f)

        # else:
        #     print('Already exists',pkl_filename)
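
#--- editorial read-back sketch (not part of this commit): each pickle written
#--- above maps shot filename -> (num_sampled_frames, 512) CLIP ViT-B/32 features;
#--- 'some_video.pkl' is an illustrative name
import pickle

with open('some_video.pkl','rb') as f:
    shot_dict=pickle.load(f)

for shot_name,feats in shot_dict.items():
    print(shot_name,feats.shape) #e.g. ('shot_001.mp4', (37, 512))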

if __name__=='__main__':

    feature_destination_folder="/data/digbose92/ads_complete_repo/ads_features/clip_embeddings/jwt_ads_of_world"
    #feature_destination_folder="/data/digbose92/ads_complete_repo/ads_features/clip_embeddings/jwt_ads_of_world"
    # Load the model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load('ViT-B/32', device)
    shot_folder="/data/digbose92/ads_complete_repo/ads_videos/shot_folder/PySceneDetect"
    destination_folder="/data/digbose92/ads_complete_repo/ads_features/shot_embeddings/clip_features_4fps"
    run_shot_inference(shot_folder,model,device,preprocess,destination_folder)

    feature_file="/data/digbose92/ads_complete_repo/ads_codes/updated_pkl_files/corrected_files/zero_clip_features_files.pkl"
    #"/data/digbose92/ads_complete_repo/ads_codes/updated_pkl_files/corrected_files/file_list_clip_features_extraction_remaining.pkl"
    with open(feature_file, "rb") as f:
        video_file_list=pickle.load(f)

    run_video_inference_updated(video_file_list,model,device,preprocess,feature_destination_folder)
    # feature_file="/data/digbose92/ads_complete_repo/ads_codes/updated_pkl_files/corrected_files/zero_clip_features_files.pkl"
    # #"/data/digbose92/ads_complete_repo/ads_codes/updated_pkl_files/corrected_files/file_list_clip_features_extraction_remaining.pkl"
    # with open(feature_file, "rb") as f:
    #     video_file_list=pickle.load(f)

    # run_video_inference_updated(video_file_list,model,device,preprocess,feature_destination_folder)



3 changes: 0 additions & 3 deletions feature_extraction/extract_vit_features.py
@@ -88,8 +88,6 @@ def run_frame_wise_feature_inference(model,processor,filename,device,dim=768,des
processor=ViTFeatureExtractor.from_pretrained(model_name)
#print layer wise names
#declare the transforms


print('Loaded model')
#load the model along with the logits
# h1 = model.pre_logits.register_forward_hook(getActivation('pre_logits'))
@@ -98,7 +96,6 @@


#read the list of shot files already processed

with open(shot_file_list,'r') as f:
    shot_filenames=f.readlines()