Skip to content

Commit

Permalink
Added updates to movieCLIP data parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
digbose92 committed Apr 3, 2023
1 parent c7a6c29 commit 459144e
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 10 deletions.
46 changes: 46 additions & 0 deletions preprocess_scripts/parse_movieCLIP_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import json
import pandas as pd
import numpy as np
import argparse
from tqdm import tqdm
from collections import Counter

def count_clean_labels(videos, threshold=0.4):
    """Count, per label, the number of shots whose score meets the threshold.

    Args:
        videos: iterable of per-video dictionaries. Each maps a shot name to
            a dictionary that has a 'labels' entry mapping label -> score.
        threshold: minimum score (inclusive) for a label to be counted.

    Returns:
        collections.Counter mapping label -> number of shots in which the
        label scored at least `threshold`.

    Raises:
        KeyError: if a shot dictionary has no 'labels' entry.
    """
    label_counter = Counter()
    for video_data in videos:
        for shot_data in video_data.values():
            labels = shot_data['labels']
            # Update the counter in place. Rebuilding a flat list with
            # `list = list + new_items` for every shot copies the whole list
            # each time and is quadratic in the number of labels.
            label_counter.update(
                label for label, score in labels.items() if score >= threshold
            )
    return label_counter


def main():
    """Read the MovieCLIP json and save the clean label distribution as json."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--source_file', type=str, required=True)
    parser.add_argument('--destination_folder', type=str, required=True)
    # optional: defaults to the cut-off that was previously hard-coded
    parser.add_argument('--score_threshold', type=float, default=0.4)
    args = parser.parse_args()

    #read the json file
    with open(args.source_file, 'r') as f:
        data = json.load(f)

    # top-level keys are video ids; only the per-video dictionaries are needed
    total_labels_dict = dict(
        count_clean_labels(tqdm(data.values()), threshold=args.score_threshold)
    )
    print(len(total_labels_dict))

    #save the total labels dict
    # create the folder first so the final write cannot fail on a missing folder
    os.makedirs(args.destination_folder, exist_ok=True)
    destination_file = os.path.join(
        args.destination_folder, 'clean_labels_movieCLIP_distribution.json'
    )
    with open(destination_file, 'w') as f:
        json.dump(total_labels_dict, f, indent=4)


if __name__ == '__main__':
    main()




43 changes: 33 additions & 10 deletions preprocess_scripts/preprocess_movieCLIP_json_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import argparse
from tqdm import tqdm
import pickle
from collections import Counter
#json file with the keys as video id and sub keys as the Scene Number
#for each scene number there is a sub-dictionary with labels and scores, Start time and end time

Expand All @@ -25,6 +26,9 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
#assert(len(tag_data)==len(csv_data['Scene Number']))
shot_level_dict={}
num_clean_samples_video=0
total_set_labels=[]


for i in range(len(csv_data['Scene Number'])):

#scene number + start time + end time
Expand All @@ -47,6 +51,7 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
if(len(scores_list)>0):
if(scores_list[0]>=0.4):
num_clean_samples_video+=1
total_set_labels=total_set_labels+labels_list

#create a dict with the labels_list and scores_list
label_dict={ labels_list[i]:scores_list[i] for i in range(len(labels_list))}
Expand All @@ -61,7 +66,7 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):

shot_level_dict[key_name]=temp_dict

return(shot_level_dict,num_clean_samples_video)
return(shot_level_dict,num_clean_samples_video,total_set_labels)

parser=argparse.ArgumentParser()
parser.add_argument('--destination_folder', type=str, required=True)
Expand Down Expand Up @@ -102,6 +107,7 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
total_tagged_samples=0
movieCLIP_dict={}
num_samples=0
total_labels=[]

for file in tqdm(total_segment_files):

Expand Down Expand Up @@ -130,11 +136,12 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
else:

#generate_video_key_wise_dictionary(tag_file_name,shot_csv_data,clip_tag_data)
shot_level_dict,total_clean_samples_per_video=generate_video_key_wise_dictionary(tag_file_name,shot_csv_data,clip_tag_data)
shot_level_dict,total_clean_samples_per_video,total_clean_labels_list=generate_video_key_wise_dictionary(tag_file_name,shot_csv_data,clip_tag_data)
total_clean_samples_video+=total_clean_samples_per_video
#print(tag_file_name)
total_labels=total_labels+total_clean_labels_list #total clean labels list

movieCLIP_dict[tag_file_name]=shot_level_dict
#print(movieCLIP_dict[tag_file_name])

total_tagged_samples+=len(shot_level_dict)
equal_files+=1
num_samples+=1
Expand All @@ -151,13 +158,29 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
print("Total tagged samples: ",total_tagged_samples) #0
print("Total clean samples: ",total_clean_samples_video) #0

#save the empty tagged files list
with open(os.path.join(dest_folder,'empty_tagged_file_list.pkl'),'wb') as f:
pickle.dump(empty_tagged_file_list,f)
# #save the empty tagged files list
# with open(os.path.join(dest_folder,'empty_tagged_file_list.pkl'),'wb') as f:
# pickle.dump(empty_tagged_file_list,f)

# #save the not present list
# with open(os.path.join(dest_folder,'not_equal_list.pkl'),'wb') as f:
# pickle.dump(not_equal_list,f)

# print the distribution of the labels
total_clean_label_counter=Counter(total_labels)
total_clean_labels_dict=dict(total_clean_label_counter)

#save the dictionary as a json
with open(os.path.join(dest_folder,'movieCLIP_dataset_class_clean_distribution.json'),'w') as f:
json.dump(total_clean_labels_dict,f,indent=4)

#print(total_clean_label_counter)

#plot the distribution of the labels using a




#save the not present list
with open(os.path.join(dest_folder,'not_equal_list.pkl'),'wb') as f:
pickle.dump(not_equal_list,f)
#save the dictionary as a json
#print(movieCLIP_dict)
# with open(os.path.join(dest_folder,'movieCLIP_dataset.json'),'w') as f:
Expand Down
72 changes: 72 additions & 0 deletions split_files/DATASET.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# MovieCLIP Dataset


## Raw videos

* Download the original videos by requesting access to the [**Condensed Movies Dataset**](https://github.com/m-bain/CondensedMovies). Our video tagging was performed on the videos present in the **Condensed Movies** dataset. We do not own the raw videos.

## CLIP tags

* The complete list of CLIP tags for the shots in the MovieCLIP dataset can be downloaded from this [**Drive Link**](https://drive.google.com/file/d/15EhA0BT3IF0EuLP1yXr5nn5ad9soxxox/view?usp=share_link)

* Load the CLIP tags using the following code snippet:

```python
import json
with open('movieCLIP_dataset.json', 'r') as f:
movieCLIP_tags = json.load(f)
```
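* **movieCLIP_tags** is a dictionary with keys as the video names (youtube ids in **Condensed Movies**) and values as dictionaries keyed by shot file name. Each shot entry holds the start/end frames, the start/end times, and a `labels` dictionary mapping each CLIP tag to its score: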

```python
{
    "qM8jk56Vj9Y": {
        "qM8jk56Vj9Y-Scene-018.mp4": {
            "start_frame": 1059.0,
            "end_frame": 1137.0,
            "start_time": 44.169,
            "end_time": 47.422,
            "labels": {
                "banquet": 0.7861328125,
                "dining room": 0.07110595703125,
                "restaurant": 0.028594970703125,
                "penthouse": 0.01611328125,
                "salon": 0.01186370849609375
            }
        }
    },
    "B-tq7mbTvrA": {
        "B-tq7mbTvrA-Scene-003.mp4": {
            "start_frame": 54.0,
            "end_frame": 74.0,
            "start_time": 2.252,
            "end_time": 3.086,
            "labels": {
                "batting cage": 0.479248046875,
                "locker room": 0.160400390625,
                "baseball field": 0.11248779296875,
                "stadium": 0.0601806640625,
                "bowling alley": 0.040496826171875
            }
        }
    },
    "Ld2g77JckSk": {
        "Ld2g77JckSk-Scene-018.mp4": {
            "start_frame": 974.0,
            "end_frame": 1031.0,
            "start_time": 40.627,
            "end_time": 43.004,
            "labels": {
                "animal shelter": 0.640625,
                "zoo": 0.07684326171875,
                "farm": 0.04071044921875,
                "fair": 0.0256500244140625,
                "suburban": 0.0123748779296875
            }
        }
    }
}
```

## TODOS
* Check whether the distribution of the labels computed from the json matches the distribution from the previous version





0 comments on commit 459144e

Please sign in to comment.