Added updates to movieCLIP data parsing

usc-sail · Apr 3, 2023 · 459144e · 459144e
1 parent c7a6c29
commit 459144e
Show file tree

Hide file tree

Showing 3 changed files with 151 additions and 10 deletions.
diff --git a/preprocess_scripts/parse_movieCLIP_json.py b/preprocess_scripts/parse_movieCLIP_json.py
@@ -0,0 +1,46 @@
+import os 
+import json 
+import pandas as pd 
+import numpy as np 
+import argparse
+from tqdm import tqdm
+from collections import Counter
+
+# Count, over every shot of every video, the labels scoring >= score_threshold
+# and save the per-label counts as a json file in the destination folder.
+parser = argparse.ArgumentParser()
+parser.add_argument('--source_file', type=str, required=True)
+parser.add_argument('--destination_folder', type=str, required=True)
+
+args = parser.parse_args()
+source_file = args.source_file
+dest_folder = args.destination_folder
+
+# a label is counted for a shot only when its score reaches this value
+score_threshold = 0.4
+
+# read the json file: {video_key: {shot_name: {'labels': {label: score}}}}
+with open(source_file, 'r') as f:
+    data = json.load(f)
+
+# update one Counter in place; rebuilding a list with "+" per shot is quadratic
+total_labels_counter = Counter()
+
+for video_data in tqdm(data.values()):
+    for shot_data in video_data.values():
+        labels = shot_data['labels']
+        total_labels_counter.update(
+            label for label, score in labels.items() if score >= score_threshold
+        )
+
+total_labels_dict = dict(total_labels_counter)
+# number of distinct labels that passed the threshold at least once
+print(len(total_labels_dict))
+
+# save the total labels dict
+with open(os.path.join(dest_folder, 'clean_labels_movieCLIP_distribution.json'), 'w') as f:
+    json.dump(total_labels_dict, f, indent=4)
+
+
+
+
diff --git a/preprocess_scripts/preprocess_movieCLIP_json_format.py b/preprocess_scripts/preprocess_movieCLIP_json_format.py
@@ -4,6 +4,7 @@
 import argparse
 from tqdm import tqdm  
 import pickle
+from collections import Counter
 #json file with the keys as video id and sub keys as the Scene Number 
 #for each scene number there is a sub-dictionary with labels and scores, Start time and end time 
 
@@ -25,6 +26,9 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
     #assert(len(tag_data)==len(csv_data['Scene Number']))
     shot_level_dict={}
     num_clean_samples_video=0
+    total_set_labels=[]
+
+
     for i in range(len(csv_data['Scene Number'])):
 
         #scene number + start time + end time
@@ -47,6 +51,7 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
             if(len(scores_list)>0):
                 if(scores_list[0]>=0.4):
                     num_clean_samples_video+=1
+                    total_set_labels=total_set_labels+labels_list
 
             #create a dict with the labels_list and scores_list
             label_dict={ labels_list[i]:scores_list[i] for i in range(len(labels_list))}
@@ -61,7 +66,7 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
 
             shot_level_dict[key_name]=temp_dict
 
-    return(shot_level_dict,num_clean_samples_video)
+    return(shot_level_dict,num_clean_samples_video,total_set_labels)
 
 parser=argparse.ArgumentParser()
 parser.add_argument('--destination_folder', type=str, required=True)
@@ -102,6 +107,7 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
 total_tagged_samples=0
 movieCLIP_dict={}
 num_samples=0
+total_labels=[]
 
 for file in tqdm(total_segment_files):
 
@@ -130,11 +136,12 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
         else:
 
             #generate_video_key_wise_dictionary(tag_file_name,shot_csv_data,clip_tag_data)
-            shot_level_dict,total_clean_samples_per_video=generate_video_key_wise_dictionary(tag_file_name,shot_csv_data,clip_tag_data)
+            shot_level_dict,total_clean_samples_per_video,total_clean_labels_list=generate_video_key_wise_dictionary(tag_file_name,shot_csv_data,clip_tag_data)
             total_clean_samples_video+=total_clean_samples_per_video
-            #print(tag_file_name)
+            total_labels=total_labels+total_clean_labels_list #total clean labels list
+
             movieCLIP_dict[tag_file_name]=shot_level_dict
-            #print(movieCLIP_dict[tag_file_name])
+
             total_tagged_samples+=len(shot_level_dict)
             equal_files+=1
             num_samples+=1
@@ -151,13 +158,29 @@ def generate_video_key_wise_dictionary(file_key,csv_data,tag_data):
 print("Total tagged samples: ",total_tagged_samples) #0
 print("Total clean samples: ",total_clean_samples_video) #0
 
-#save the empty tagged files list
-with open(os.path.join(dest_folder,'empty_tagged_file_list.pkl'),'wb') as f:
-    pickle.dump(empty_tagged_file_list,f)
+# #save the empty tagged files list
+# with open(os.path.join(dest_folder,'empty_tagged_file_list.pkl'),'wb') as f:
+#     pickle.dump(empty_tagged_file_list,f)
+
+# #save the not present list
+# with open(os.path.join(dest_folder,'not_equal_list.pkl'),'wb') as f:
+#     pickle.dump(not_equal_list,f)
+
+# count how often each clean label occurs (the label distribution)
+total_clean_label_counter=Counter(total_labels)
+total_clean_labels_dict=dict(total_clean_label_counter)
+
+#save the dictionary as a json
+with open(os.path.join(dest_folder,'movieCLIP_dataset_class_clean_distribution.json'),'w') as f:
+    json.dump(total_clean_labels_dict,f,indent=4)
+
+#print(total_clean_label_counter)
+
+#plot the distribution of the labels using a
+
+
+
 
-#save the not present list
-with open(os.path.join(dest_folder,'not_equal_list.pkl'),'wb') as f:
-    pickle.dump(not_equal_list,f)
 #save the dictionary as a json
 #print(movieCLIP_dict)
 # with open(os.path.join(dest_folder,'movieCLIP_dataset.json'),'w') as f:

diff --git a/split_files/DATASET.md b/split_files/DATASET.md
@@ -0,0 +1,72 @@
+# MovieCLIP Dataset
+
+
+## Raw videos 
+
+* Download the original videos by requesting access to the [**Condensed Movies Dataset**](https://github.com/m-bain/CondensedMovies). Our video tagging was performed on the videos in the **Condensed Movies** dataset. We do not own the raw videos.
+
+## CLIP tags 
+
+* The complete list of CLIP tags for the shots in the MovieCLIP dataset can be downloaded from this [**Drive Link**](https://drive.google.com/file/d/15EhA0BT3IF0EuLP1yXr5nn5ad9soxxox/view?usp=share_link)
+
+* Load the CLIP tags using the following code snippet:
+
+    ```python
+    import json
+    with open('movieCLIP_dataset.json', 'r') as f:
+        movieCLIP_tags = json.load(f)
+    ```
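+* **movieCLIP_tags** is a dictionary whose keys are the video names (YouTube ids in **Condensed Movies**). Each value is a dictionary keyed by shot file name; every shot entry stores its start/end frame, start/end time, and a `labels` dictionary mapping each CLIP tag to its score: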
+
+    ```python
+    "qM8jk56Vj9Y":
+        "qM8jk56Vj9Y-Scene-018.mp4": {
+                "start_frame": 1059.0,
+                "end_frame": 1137.0,
+                "start_time": 44.169,
+                "end_time": 47.422,
+                "labels": {
+                    "banquet": 0.7861328125,
+                    "dining room": 0.07110595703125,
+                    "restaurant": 0.028594970703125,
+                    "penthouse": 0.01611328125,
+                    "salon": 0.01186370849609375
+                }
+            },
+    "B-tq7mbTvrA":
+        "B-tq7mbTvrA-Scene-003.mp4": {
+            "start_frame": 54.0,
+            "end_frame": 74.0,
+            "start_time": 2.252,
+            "end_time": 3.086,
+            "labels": {
+                "batting cage": 0.479248046875,
+                "locker room": 0.160400390625,
+                "baseball field": 0.11248779296875,
+                "stadium": 0.0601806640625,
+                "bowling alley": 0.040496826171875
+            }
+        },
+    "Ld2g77JckSk":
+        "Ld2g77JckSk-Scene-018.mp4": {
+            "start_frame": 974.0,
+            "end_frame": 1031.0,
+            "start_time": 40.627,
+            "end_time": 43.004,
+            "labels": {
+                "animal shelter": 0.640625,
+                "zoo": 0.07684326171875,
+                "farm": 0.04071044921875,
+                "fair": 0.0256500244140625,
+                "suburban": 0.0123748779296875
+            }
+        }
+    ```
+
+## TODOs
+* Check whether the label distribution in the json matches the previous version
+
+
+
+
+