diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py index 84494a158ef..53b68d430fa 100644 --- a/src/main/python/systemds/scuro/__init__.py +++ b/src/main/python/systemds/scuro/__init__.py @@ -18,59 +18,60 @@ # under the License. # # ------------------------------------------------------------- +from systemds.scuro.dataloader.base_loader import BaseLoader +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader +from systemds.scuro.dataloader.text_loader import TextLoader +from systemds.scuro.dataloader.json_loader import JSONLoader from systemds.scuro.representations.representation import Representation from systemds.scuro.representations.average import Average from systemds.scuro.representations.concatenation import Concatenation -from systemds.scuro.representations.fusion import Fusion from systemds.scuro.representations.sum import Sum from systemds.scuro.representations.max import RowMax from systemds.scuro.representations.multiplication import Multiplication from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.resnet import ResNet from systemds.scuro.representations.bert import Bert -from systemds.scuro.representations.unimodal import UnimodalRepresentation from systemds.scuro.representations.lstm import LSTM -from systemds.scuro.representations.representation_dataloader import ( - NPY, - Pickle, - HDF5, - JSON, -) +from systemds.scuro.representations.bow import BoW +from systemds.scuro.representations.glove import GloVe +from systemds.scuro.representations.tfidf import TfIdf +from systemds.scuro.representations.word2vec import W2V from systemds.scuro.models.model import Model from systemds.scuro.models.discrete_model import DiscreteModel -from systemds.scuro.modality.aligned_modality import AlignedModality -from systemds.scuro.modality.audio_modality import AudioModality -from 
systemds.scuro.modality.video_modality import VideoModality -from systemds.scuro.modality.text_modality import TextModality from systemds.scuro.modality.modality import Modality +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.modality.transformed import TransformedModality +from systemds.scuro.modality.type import ModalityType from systemds.scuro.aligner.dr_search import DRSearch from systemds.scuro.aligner.task import Task __all__ = [ + "BaseLoader", + "AudioLoader", + "VideoLoader", + "TextLoader", "Representation", "Average", "Concatenation", - "Fusion", "Sum", "RowMax", "Multiplication", "MelSpectrogram", "ResNet", "Bert", - "UnimodalRepresentation", "LSTM", - "NPY", - "Pickle", - "HDF5", - "JSON", + "BoW", + "GloVe", + "TfIdf", + "W2V", "Model", "DiscreteModel", - "AlignedModality", - "AudioModality", - "VideoModality", - "TextModality", "Modality", + "UnimodalModality", + "TransformedModality", + "ModalityType", "DRSearch", "Task", ] diff --git a/src/main/python/systemds/scuro/aligner/alignment.py b/src/main/python/systemds/scuro/aligner/alignment.py index e341e1b76bf..62f88a272b9 100644 --- a/src/main/python/systemds/scuro/aligner/alignment.py +++ b/src/main/python/systemds/scuro/aligner/alignment.py @@ -19,7 +19,6 @@ # # ------------------------------------------------------------- from aligner.alignment_strategy import AlignmentStrategy -from modality.aligned_modality import AlignedModality from modality.modality import Modality from modality.representation import Representation from aligner.similarity_measures import Measure @@ -46,4 +45,4 @@ def __init__( self.similarity_measure = similarity_measure def align_modalities(self) -> Modality: - return AlignedModality(Representation()) + return Modality(Representation()) diff --git a/src/main/python/systemds/scuro/aligner/dr_search.py b/src/main/python/systemds/scuro/aligner/dr_search.py index 24f3c3236f5..b46139dff30 100644 --- 
a/src/main/python/systemds/scuro/aligner/dr_search.py +++ b/src/main/python/systemds/scuro/aligner/dr_search.py @@ -23,7 +23,6 @@ from typing import List from systemds.scuro.aligner.task import Task -from systemds.scuro.modality.aligned_modality import AlignedModality from systemds.scuro.modality.modality import Modality from systemds.scuro.representations.representation import Representation @@ -64,27 +63,25 @@ def __init__( def set_best_params( self, - modality_name: str, representation: Representation, scores: List[float], modality_names: List[str], ): """ Updates the best parameters for given modalities, representation, and score - :param modality_name: The name of the aligned modality :param representation: The representation used to retrieve the current score - :param score: achieved score for the set of modalities and representation + :param scores: achieved train/test scores for the set of modalities and representation :param modality_names: List of modality names used in this setting :return: """ # check if modality name is already in dictionary - if modality_name not in self.scores.keys(): + if "_".join(modality_names) not in self.scores.keys(): # if not add it to dictionary - self.scores[modality_name] = {} + self.scores["_".join(modality_names)] = {} # set score for representation - self.scores[modality_name][representation] = scores + self.scores["_".join(modality_names)][representation] = scores # compare current score with best score if scores[1] > self.best_score: @@ -113,13 +110,12 @@ def fit_random(self, seed=-1): modality_combination = random.choice(modalities) representation = random.choice(self.representations) - modality = AlignedModality(representation, list(modality_combination)) # noqa - modality.combine() + modality = modality_combination[0].combine( + modality_combination[1:], representation + ) scores = self.task.run(modality.data) - self.set_best_params( - modality.name, representation, scores, modality.get_modality_names() - ) + 
self.set_best_params(representation, scores, modality.get_modality_names()) return self.best_representation, self.best_score, self.best_modalities @@ -133,14 +129,14 @@ def fit_enumerate_all(self): for M in range(1, len(self.modalities) + 1): for combination in itertools.combinations(self.modalities, M): for representation in self.representations: - modality = AlignedModality( - representation, list(combination) - ) # noqa - modality.combine() + modality = combination[0] + if len(combination) > 1: + modality = combination[0].combine( + list(combination[1:]), representation + ) scores = self.task.run(modality.data) self.set_best_params( - modality.name, representation, scores, modality.get_modality_names(), @@ -164,7 +160,8 @@ def transform(self, modalities: List[Modality]): for modality_name in self.best_modalities: used_modalities.append(get_modalities_by_name(modalities, modality_name)) - modality = AlignedModality(self.best_representation, used_modalities) # noqa - modality.combine(self.task.train_indices) + modality = used_modalities[0].combine( + used_modalities[1:], self.best_representation + ) return modality.data diff --git a/src/main/python/systemds/scuro/dataloader/__init__.py b/src/main/python/systemds/scuro/dataloader/__init__.py new file mode 100644 index 00000000000..e66abb4646f --- /dev/null +++ b/src/main/python/systemds/scuro/dataloader/__init__.py @@ -0,0 +1,20 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py new file mode 100644 index 00000000000..f85b1b80faa --- /dev/null +++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py @@ -0,0 +1,39 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
from typing import List, Optional, Union

import librosa
from systemds.scuro.dataloader.base_loader import BaseLoader


class AudioLoader(BaseLoader):
    """
    Loader that reads raw audio files with librosa.
    Every extracted file contributes one entry (its waveform) to ``self.data``.
    """

    def __init__(
        self,
        source_path: str,
        indices: List[str],
        chunk_size: Optional[int] = None,
    ):
        """
        :param source_path: The location where the raw audio data lies
        :param indices: A list of indices as strings that are corresponding to the file names
        :param chunk_size: An optional argument to load the data in chunks instead of all at once
        """
        super().__init__(source_path, indices, chunk_size)

    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
        """
        Loads a single audio file and appends its waveform to the data object
        :param file: Path of the audio file to read
        :param index: Unused; accepted so the signature matches BaseLoader.extract,
            which is also called with an index argument when the source is a single file
        """
        self.file_sanity_check(file)
        # librosa.load returns the waveform together with its sampling rate;
        # only the waveform is stored.
        audio, _ = librosa.load(file)
        self.data.append(audio)
import os
from abc import ABC, abstractmethod
from typing import List, Optional, Union


class BaseLoader(ABC):
    """
    Abstract base class for loaders that read raw data from disk.
    Subclasses implement ``extract`` and append what they read to ``self.data``.
    """

    def __init__(
        self, source_path: str, indices: List[str], chunk_size: Optional[int] = None
    ):
        """
        Base class to load raw data for a given list of indices and stores them in the data object
        :param source_path: The location where the raw data lies
        :param indices: A list of indices as strings that are corresponding to the file names
        :param chunk_size: An optional argument to load the data in chunks instead of all at once
        (otherwise please provide your own Dataloader that knows about the file name convention)
        """
        self.data = []
        self.source_path = source_path
        self.indices = indices
        self.chunk_size = chunk_size
        self.next_chunk = 0

        if self.chunk_size:
            # Round up so that a trailing, only partially filled chunk is counted.
            self.num_chunks = (
                len(self.indices) + self.chunk_size - 1
            ) // self.chunk_size
        else:
            # Without chunking a single load() call reads everything.
            self.num_chunks = 1

    def load(self):
        """
        Takes care of loading the raw data either chunk wise (if chunk size is defined) or all at once
        :return: The list of loaded data entries
        """
        if self.chunk_size:
            return self._load_next_chunk()

        return self._load(self.indices)

    def _load_next_chunk(self):
        """
        Loads the next chunk of data; the previously loaded chunk is discarded
        :return: The data entries of the chunk (empty once all indices were consumed)
        """
        self.data = []
        start = self.next_chunk * self.chunk_size
        next_chunk_indices = self.indices[start : start + self.chunk_size]
        self.next_chunk += 1
        return self._load(next_chunk_indices)

    def _load(self, indices: List[str]):
        """
        Extracts the data for the given indices.
        If the source path is a directory, one file per index is extracted; otherwise
        the source path itself is handed to extract together with all indices.
        :param indices: The indices to load
        :return: The list of loaded data entries
        :raises FileNotFoundError: If the source directory contains no files
        """
        if os.path.isdir(self.source_path):
            entries = os.listdir(self.source_path)
            if not entries:
                raise FileNotFoundError(
                    "Directory {0} contains no files".format(self.source_path)
                )
            # The file extension is taken from the first directory entry, i.e. all
            # files in the directory are expected to share the same extension.
            _, ext = os.path.splitext(entries[0])
            for index in indices:
                # os.path.join works with and without a trailing path separator
                self.extract(os.path.join(self.source_path, index + ext))
        else:
            self.extract(self.source_path, indices)

        return self.data

    @abstractmethod
    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
        """
        Reads the given file and appends the extracted content to the data object
        :param file: Path of the file to read
        :param index: Optional index (or list of indices) to select entries inside the file
        """
        pass

    def file_sanity_check(self, file):
        """
        Checks if the file can be found and is not empty
        :param file: Path of the file to check
        :raises FileNotFoundError: If the size of the file cannot be determined
        :raises ValueError: If the file is empty
        """
        try:
            file_size = os.path.getsize(file)
        except OSError as err:
            raise FileNotFoundError(
                "Error: File {0} not found!".format(file)
            ) from err

        if file_size == 0:
            raise ValueError("File {0} is empty".format(file))
import json

from systemds.scuro.dataloader.base_loader import BaseLoader
from typing import Optional, List


class JSONLoader(BaseLoader):
    """
    Loader that reads one field of selected records from a single JSON file.
    """

    def __init__(
        self,
        source_path: str,
        indices: List[str],
        field: str,
        chunk_size: Optional[int] = None,
    ):
        """
        :param source_path: The location of the JSON file
        :param indices: A list of indices as strings that are used as keys into the JSON document
        :param field: Name of the field that is read from every selected record
        :param chunk_size: An optional argument to load the data in chunks instead of all at once
        """
        super().__init__(source_path, indices, chunk_size)
        self.field = field

    def extract(self, file: str, indices: List[str]):
        """
        Reads the JSON file and appends the configured field of every requested record to the data object
        :param file: Path of the JSON file to read
        :param indices: Keys of the records to extract
        """
        self.file_sanity_check(file)
        # JSON documents are UTF-8 encoded; do not rely on the platform default encoding.
        with open(file, encoding="utf-8") as f:
            records = json.load(f)

        # The document is fully parsed at this point, so the file is already closed
        # while the requested records are collected.
        self.data.extend(records[idx][self.field] for idx in indices)
from systemds.scuro.dataloader.base_loader import BaseLoader
from typing import Optional, Pattern, List, Union
import re


class TextLoader(BaseLoader):
    """
    Loader that reads text files line by line.
    Every line of an extracted file becomes one entry in ``self.data``.
    """

    def __init__(
        self,
        source_path: str,
        indices: List[str],
        chunk_size: Optional[int] = None,
        prefix: Optional[Pattern[str]] = None,
        encoding: Optional[str] = None,
    ):
        """
        :param source_path: The location where the raw text data lies
        :param indices: A list of indices as strings that are corresponding to the file names
        :param chunk_size: An optional argument to load the data in chunks instead of all at once
        :param prefix: Optional pattern; every match is removed from each line
        :param encoding: Optional encoding used to open the files (platform default if not set)
        """
        super().__init__(source_path, indices, chunk_size)
        self.prefix = prefix
        self.encoding = encoding

    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
        """
        Reads a text file and appends every line (without line break) to the data object
        :param file: Path of the text file to read
        :param index: Unused; accepted so the signature matches BaseLoader.extract,
            which is also called with an index argument when the source is a single file
        """
        self.file_sanity_check(file)
        with open(file, encoding=self.encoding) as text_file:
            for line in text_file:
                if self.prefix:
                    line = re.sub(self.prefix, "", line)
                self.data.append(line.replace("\n", ""))
from typing import List, Optional, Union

import numpy as np

from systemds.scuro.dataloader.base_loader import BaseLoader
import cv2


class VideoLoader(BaseLoader):
    """
    Loader that decodes video files frame by frame with OpenCV.
    Every extracted file contributes one entry (the list of its frames) to ``self.data``.
    """

    def __init__(
        self,
        source_path: str,
        indices: List[str],
        chunk_size: Optional[int] = None,
    ):
        """
        :param source_path: The location where the raw video data lies
        :param indices: A list of indices as strings that are corresponding to the file names
        :param chunk_size: An optional argument to load the data in chunks instead of all at once
        """
        super().__init__(source_path, indices, chunk_size)

    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
        """
        Decodes all frames of a video file and appends them as one list to the data object
        :param file: Path of the video file to read
        :param index: Unused; accepted so the signature matches BaseLoader.extract,
            which is also called with an index argument when the source is a single file
        """
        self.file_sanity_check(file)
        cap = cv2.VideoCapture(file)
        frames = []
        try:
            while cap.isOpened():
                ret, frame = cap.read()

                # read() reports failure once no further frame can be decoded
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # scale to float32; dividing by 255 presumably assumes 8-bit frames
                frames.append(frame.astype(np.float32) / 255.0)
        finally:
            # always free the capture handle, even if decoding a frame fails
            cap.release()

        self.data.append(frames)
import DRSearch +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.text_loader import TextLoader +from systemds.scuro.dataloader.video_loader import VideoLoader + class CustomTask(Task): def __init__(self, model, labels, train_indices, val_indices): @@ -49,18 +48,32 @@ def run(self, data): train_indices = [] val_indices = [] +all_indices = [] + video_path = "" audio_path = "" text_path = "" + +# Define dataloaders +video_data_loader = VideoLoader(video_path, all_indices, chunk_size=10) +text_data_loader = TextLoader(text_path, all_indices) +audio_data_loader = AudioLoader(audio_path, all_indices) + # Load modalities (audio, video, text) -video = VideoModality(video_path, HDF5(), train_indices) -audio = AudioModality(audio_path, Pickle(), train_indices) -text = TextModality(text_path, NPY(), train_indices) +video = UnimodalModality(video_data_loader, "VIDEO") +audio = UnimodalModality(audio_data_loader, "AUDIO") +text = UnimodalModality(text_data_loader, "TEXT") + +# Define unimodal representations +r_v = ResNet() +r_a = MelSpectrogram() +r_t = Bert() -video.read_all() -audio.read_all() -text.read_all() +# Transform raw unimodal data +video.apply_representation(r_v) +audio.apply_representation(r_a) +text.apply_representation(r_t) modalities = [text, audio, video] diff --git a/src/main/python/systemds/scuro/modality/aligned_modality.py b/src/main/python/systemds/scuro/modality/aligned_modality.py deleted file mode 100644 index 839b9d296f8..00000000000 --- a/src/main/python/systemds/scuro/modality/aligned_modality.py +++ /dev/null @@ -1,51 +0,0 @@ -# ------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# ------------------------------------------------------------- -from typing import List - -from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.fusion import Fusion - - -class AlignedModality(Modality): - def __init__(self, representation: Fusion, modalities: List[Modality]): - """ - Defines the modality that is created during the fusion process - :param representation: The representation for the aligned modality - :param modalities: List of modalities to be combined - """ - name = "" - for modality in modalities: - name += modality.name - super().__init__(representation, modality_name=name) - self.modalities = modalities - - def combine(self): - """ - Initiates the call to fuse the given modalities depending on the Fusion type - """ - self.data = self.representation.fuse(self.modalities) # noqa - - def get_modality_names(self): - names = [] - for modality in self.modalities: - names.append(modality.name) - - return names diff --git a/src/main/python/systemds/scuro/modality/audio_modality.py b/src/main/python/systemds/scuro/modality/audio_modality.py deleted file mode 100644 index ba849622269..00000000000 --- a/src/main/python/systemds/scuro/modality/audio_modality.py +++ /dev/null @@ -1,61 +0,0 @@ -# ------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor 
license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -# ------------------------------------------------------------- -import os - -from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.unimodal import UnimodalRepresentation - - -class AudioModality(Modality): - def __init__( - self, - file_path: str, - representation: UnimodalRepresentation, - train_indices=None, - start_index: int = 0, - ): - """ - Creates an audio modality - :param file_path: path to file where the audio embeddings are stored - :param representation: Unimodal representation that indicates how to extract the data from the file - """ - super().__init__(representation, start_index, "Audio", train_indices) - self.file_path = file_path - - def file_sanity_check(self): - """ - Checks if the file can be found is not empty - """ - try: - file_size = os.path.getsize(self.file_path) - except: - raise (f"Error: File {0} not found!".format(self.file_path)) - - if file_size == 0: - raise ("File {0} is empty".format(self.file_path)) - - def read_chunk(self): - pass - - def read_all(self, indices=None): - self.data = self.representation.parse_all( - self.file_path, indices=indices - ) # noqa diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py index 
a899576d5b8..9a3d1b148d2 100644 --- a/src/main/python/systemds/scuro/modality/modality.py +++ b/src/main/python/systemds/scuro/modality/modality.py @@ -18,41 +18,27 @@ # under the License. # # ------------------------------------------------------------- +from typing import List -from systemds.scuro.representations.representation import Representation +from systemds.scuro.modality.type import ModalityType class Modality: - def __init__( - self, - representation: Representation, - start_index: int = 0, - modality_name="", - train_indices=None, - ): + def __init__(self, modality_type: ModalityType): """ - Parent class of the different Modalities - :param representation: Specifies how the data should be represented for a specific modality - :param start_index: Defines the first index used for the alignment - :param modality_name: Name of the modality - :param train_indices: List of indices used for train-test split + Parent class of the different Modalities (unimodal & multimodal) + :param modality_type: Type of the modality """ - self.representation = representation - self.start_index = start_index - self.name = modality_name + self.type = modality_type self.data = None - self.train_indices = train_indices + self.data_type = None + self.cost = None + self.shape = None + self.schema = {} - def read_chunk(self): + def get_modality_names(self) -> List[str]: """ - Extracts a data chunk of the modality according to the window size specified in params + Extracts the individual unimodal modalities for a given transformed modality. 
""" - raise NotImplementedError - - def read_all(self, indices): - """ - Implemented for every unique modality to read all samples from a specified format - :param indices: List of indices to be read - """ - pass + return [modality.name for modality in ModalityType if modality in self.type] diff --git a/src/main/python/systemds/scuro/modality/text_modality.py b/src/main/python/systemds/scuro/modality/text_modality.py deleted file mode 100644 index c636de71672..00000000000 --- a/src/main/python/systemds/scuro/modality/text_modality.py +++ /dev/null @@ -1,61 +0,0 @@ -# ------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -# ------------------------------------------------------------- -import os - -from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.unimodal import UnimodalRepresentation - - -class TextModality(Modality): - def __init__( - self, - file_path: str, - representation: UnimodalRepresentation, - train_indices=None, - start_index: int = 0, - ): - """ - Creates a text modality - :param file_path: path to file(s) where the text data is stored - :param representation: Unimodal representation that indicates how to extract the data from the file - """ - super().__init__(representation, start_index, "Text", train_indices) - self.file_path = file_path - - def file_sanity_check(self): - """ - Checks if the file can be found is not empty - """ - try: - file_size = os.path.getsize(self.file_path) - except: - raise (f"Error: File {0} not found!".format(self.file_path)) - - if file_size == 0: - raise ("File {0} is empty".format(self.file_path)) - - def read_chunk(self): - pass - - def read_all(self, indices=None): - self.data = self.representation.parse_all( - self.file_path, indices=indices - ) # noqa diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py new file mode 100644 index 00000000000..61c327e469e --- /dev/null +++ b/src/main/python/systemds/scuro/modality/transformed.py @@ -0,0 +1,52 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
from functools import reduce
from operator import or_

from systemds.scuro.modality.modality import Modality
from systemds.scuro.modality.type import ModalityType


class TransformedModality(Modality):
    """
    A modality whose data is the result of a transformation
    (a representation or a fusion of several modalities).
    """

    def __init__(self, modality_type: ModalityType, transformation):
        """
        :param modality_type: Type of the original modality(ies)
        :param transformation: Representation to be applied on the modality
        """
        super().__init__(modality_type)
        self.transformation = transformation

    def combine(self, other, fusion_method):
        """
        Combines two or more modalities with each other using a dedicated fusion method
        :param other: The modality, or an iterable of modalities, to be combined with this one
        :param fusion_method: The fusion method to be used to combine modalities
        :return: A new TransformedModality that holds the fused data
        """
        # Accept a single modality as well as any iterable; materialize it once
        # because it is needed for both the type union and the fusion input.
        others = [other] if isinstance(other, Modality) else list(other)

        # The type of the fused modality is the union of all participating types.
        fused_type = reduce(or_, (o.type for o in others), self.type)
        fused_modality = TransformedModality(fused_type, fusion_method)
        fused_modality.data = fusion_method.transform([self] + others)

        return fused_modality
from enum import Enum, Flag, auto


class ModalityType(Flag):
    """
    Kind of data a modality carries.

    Being a Flag, members can be combined with ``|`` to describe data that
    originates from several modalities, e.g.
    ``ModalityType.TEXT | ModalityType.AUDIO``.
    """

    TEXT = auto()
    AUDIO = auto()
    VIDEO = auto()
from systemds.scuro.dataloader.base_loader import BaseLoader
from systemds.scuro.modality.modality import Modality
from systemds.scuro.modality.transformed import TransformedModality
from systemds.scuro.modality.type import ModalityType


class UnimodalModality(Modality):

    def __init__(self, data_loader: BaseLoader, modality_type: ModalityType):
        """
        This class represents a unimodal modality.
        :param data_loader: Defines how the raw data should be loaded
        :param modality_type: Type of the modality
        """
        super().__init__(modality_type)
        self.data_loader = data_loader

    def extract_raw_data(self):
        """
        Uses the data loader to read the raw data from a specified location
        and stores the result in self.data.
        TODO: schema
        """
        self.data = self.data_loader.load()

    def apply_representation(self, representation):
        """
        Applies a representation to the raw data of this modality
        :param representation: Representation whose transform() is called on the raw data
        :return: A new TransformedModality of the same type holding the transformed data
        """
        new_modality = TransformedModality(self.type, representation)
        new_modality.data = []

        if self.data_loader.chunk_size:
            # Chunked mode: load and transform one chunk at a time.
            # NOTE(review): presumably load() advances next_chunk; confirm
            # against BaseLoader, otherwise this loop does not terminate.
            while self.data_loader.next_chunk < self.data_loader.num_chunks:
                self.extract_raw_data()
                new_modality.data.extend(representation.transform(self.data))
        else:
            # Explicit emptiness check instead of `not self.data`: the truth
            # value of a multi-element NumPy array is ambiguous and raises.
            if self.data is None or len(self.data) == 0:
                self.extract_raw_data()
            new_modality.data = representation.transform(self.data)

        return new_modality
-# -# ------------------------------------------------------------- -import os - -from systemds.scuro.modality.modality import Modality -from systemds.scuro.representations.unimodal import UnimodalRepresentation - - -class VideoModality(Modality): - def __init__( - self, - file_path: str, - representation: UnimodalRepresentation, - train_indices=None, - start_index: int = 0, - ): - """ - Creates a video modality - :param file_path: path to file where the video embeddings (for now) are stored - :param representation: Unimodal representation that indicates how to extract the data from the file - """ - super().__init__(representation, start_index, "Video", train_indices) - self.file_path = file_path - - def file_sanity_check(self): - """ - Checks if the file can be found is not empty - """ - try: - file_size = os.path.getsize(self.file_path) - except: - raise (f"Error: File {0} not found!".format(self.file_path)) - - if file_size == 0: - raise ("File {0} is empty".format(self.file_path)) - - def read_chunk(self): - pass - - def read_all(self, indices=None): - self.data = self.representation.parse_all( - self.file_path, indices=indices - ) # noqa diff --git a/src/main/python/systemds/scuro/representations/average.py b/src/main/python/systemds/scuro/representations/average.py index d10778f1136..db44050e9e0 100644 --- a/src/main/python/systemds/scuro/representations/average.py +++ b/src/main/python/systemds/scuro/representations/average.py @@ -36,7 +36,7 @@ def __init__(self): """ super().__init__("Average") - def fuse(self, modalities: List[Modality]): + def transform(self, modalities: List[Modality]): max_emb_size = self.get_max_embedding_size(modalities) padded_modalities = [] diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py index 85d0b1ad658..0fcf1e8d280 100644 --- a/src/main/python/systemds/scuro/representations/bert.py +++ b/src/main/python/systemds/scuro/representations/bert.py @@ -24,7 
+24,7 @@ from systemds.scuro.representations.unimodal import UnimodalRepresentation import torch from transformers import BertTokenizer, BertModel -from systemds.scuro.representations.utils import read_data_from_file, save_embeddings +from systemds.scuro.representations.utils import save_embeddings class Bert(UnimodalRepresentation): @@ -34,8 +34,7 @@ def __init__(self, avg_layers=None, output_file=None): self.avg_layers = avg_layers self.output_file = output_file - def parse_all(self, filepath, indices): - data = read_data_from_file(filepath, indices) + def transform(self, data): model_name = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained( @@ -47,13 +46,10 @@ def parse_all(self, filepath, indices): else: model = BertModel.from_pretrained(model_name) - embeddings = self.create_embeddings(list(data.values()), model, tokenizer) + embeddings = self.create_embeddings(data, model, tokenizer) if self.output_file is not None: - data = {} - for i in range(0, embeddings.shape[0]): - data[indices[i]] = embeddings[i] - save_embeddings(data, self.output_file) + save_embeddings(embeddings, self.output_file) return embeddings @@ -75,8 +71,5 @@ def create_embeddings(self, data, model, tokenizer): cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy() embeddings.append(cls_embedding) - if self.output_file is not None: - save_embeddings(embeddings, self.output_file) - embeddings = np.array(embeddings) return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1])) diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py index dc5013b3540..bd54654a5cb 100644 --- a/src/main/python/systemds/scuro/representations/bow.py +++ b/src/main/python/systemds/scuro/representations/bow.py @@ -19,11 +19,10 @@ # # ------------------------------------------------------------- -import pandas as pd from sklearn.feature_extraction.text import CountVectorizer from 
systemds.scuro.representations.unimodal import UnimodalRepresentation -from systemds.scuro.representations.utils import read_data_from_file, save_embeddings +from systemds.scuro.representations.utils import save_embeddings class BoW(UnimodalRepresentation): @@ -33,19 +32,14 @@ def __init__(self, ngram_range, min_df, output_file=None): self.min_df = min_df self.output_file = output_file - def parse_all(self, filepath, indices): + def transform(self, data): vectorizer = CountVectorizer( ngram_range=(1, self.ngram_range), min_df=self.min_df ) - segments = read_data_from_file(filepath, indices) - X = vectorizer.fit_transform(segments.values()) - X = X.toarray() + X = vectorizer.fit_transform(data).toarray() if self.output_file is not None: - df = pd.DataFrame(X) - df.index = segments.keys() - - save_embeddings(df, self.output_file) + save_embeddings(X, self.output_file) return X diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py index 7694fa68977..fd9293d3997 100644 --- a/src/main/python/systemds/scuro/representations/concatenation.py +++ b/src/main/python/systemds/scuro/representations/concatenation.py @@ -37,7 +37,7 @@ def __init__(self, padding=True): super().__init__("Concatenation") self.padding = padding - def fuse(self, modalities: List[Modality]): + def transform(self, modalities: List[Modality]): if len(modalities) == 1: return np.array(modalities[0].data) diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py index e84e59f6663..623979dd052 100644 --- a/src/main/python/systemds/scuro/representations/fusion.py +++ b/src/main/python/systemds/scuro/representations/fusion.py @@ -32,7 +32,7 @@ def __init__(self, name): """ super().__init__(name) - def fuse(self, modalities: List[Modality]): + def transform(self, modalities: List[Modality]): """ Implemented for every child class and creates a fused 
representation out of multiple modalities diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py index 840360540e5..cf13c717d2f 100644 --- a/src/main/python/systemds/scuro/representations/glove.py +++ b/src/main/python/systemds/scuro/representations/glove.py @@ -18,7 +18,6 @@ # under the License. # # ------------------------------------------------------------- -import nltk import numpy as np from nltk import word_tokenize @@ -43,24 +42,24 @@ def __init__(self, glove_path, output_file=None): self.glove_path = glove_path self.output_file = output_file - def parse_all(self, filepath, indices): + def transform(self, data): glove_embeddings = load_glove_embeddings(self.glove_path) - segments = read_data_from_file(filepath, indices) - embeddings = {} - for k, v in segments.items(): - tokens = word_tokenize(v.lower()) - embeddings[k] = np.mean( - [ - glove_embeddings[token] - for token in tokens - if token in glove_embeddings - ], - axis=0, + embeddings = [] + for sentences in data: + tokens = word_tokenize(sentences.lower()) + embeddings.append( + np.mean( + [ + glove_embeddings[token] + for token in tokens + if token in glove_embeddings + ], + axis=0, + ) ) if self.output_file is not None: - save_embeddings(embeddings, self.output_file) + save_embeddings(np.array(embeddings), self.output_file) - embeddings = np.array(list(embeddings.values())) - return embeddings + return np.array(embeddings) diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py index 3687ff65147..649b81117b2 100644 --- a/src/main/python/systemds/scuro/representations/lstm.py +++ b/src/main/python/systemds/scuro/representations/lstm.py @@ -40,17 +40,17 @@ def __init__(self, width=128, depth=1, dropout_rate=0.1): self.dropout_rate = dropout_rate self.unimodal_embeddings = {} - def fuse(self, modalities: List[Modality], train_indices=None): + def transform(self, 
modalities: List[Modality]): size = len(modalities[0].data) result = np.zeros((size, 0)) for modality in modalities: - if modality.name in self.unimodal_embeddings.keys(): - out = self.unimodal_embeddings.get(modality.name) + if modality.type in self.unimodal_embeddings.keys(): + out = self.unimodal_embeddings.get(modality.type) else: out = self.run_lstm(modality.data) - self.unimodal_embeddings[modality.name] = out + self.unimodal_embeddings[modality.type] = out result = np.concatenate([result, out], axis=-1) diff --git a/src/main/python/systemds/scuro/representations/max.py b/src/main/python/systemds/scuro/representations/max.py index 2e1e8644773..194b20801e0 100644 --- a/src/main/python/systemds/scuro/representations/max.py +++ b/src/main/python/systemds/scuro/representations/max.py @@ -38,7 +38,7 @@ def __init__(self, split=4): super().__init__("RowMax") self.split = split - def fuse( + def transform( self, modalities: List[Modality], ): diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py index 27aba8b997d..57a7fab83e2 100644 --- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py +++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py @@ -19,7 +19,6 @@ # # ------------------------------------------------------------- -import os import pickle import librosa @@ -35,19 +34,15 @@ def __init__(self, avg=True, output_file=None): self.avg = avg self.output_file = output_file - def parse_all(self, file_path, indices, get_sequences=False): + def transform(self, data): result = [] max_length = 0 - if os.path.isdir(file_path): - for filename in os.listdir(file_path): - f = os.path.join(file_path, filename) - if os.path.isfile(f): - y, sr = librosa.load(f) - S = librosa.feature.melspectrogram(y=y, sr=sr) - S_dB = librosa.power_to_db(S, ref=np.max) - if S_dB.shape[-1] > max_length: - max_length = S_dB.shape[-1] - result.append(S_dB) + for sample in 
data: + S = librosa.feature.melspectrogram(y=sample) + S_dB = librosa.power_to_db(S, ref=np.max) + if S_dB.shape[-1] > max_length: + max_length = S_dB.shape[-1] + result.append(S_dB) r = [] for elem in result: @@ -57,9 +52,9 @@ def parse_all(self, file_path, indices, get_sequences=False): np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1) if self.output_file is not None: - data = {} + data = [] for i in range(0, np_array_r.shape[0]): - data[indices[i]] = np_array_r[i] + data.append(np_array_r[i]) with open(self.output_file, "wb") as file: pickle.dump(data, file) diff --git a/src/main/python/systemds/scuro/representations/multiplication.py b/src/main/python/systemds/scuro/representations/multiplication.py index 18f34bae6f9..2934fe5b3c9 100644 --- a/src/main/python/systemds/scuro/representations/multiplication.py +++ b/src/main/python/systemds/scuro/representations/multiplication.py @@ -36,7 +36,7 @@ def __init__(self): """ super().__init__("Multiplication") - def fuse(self, modalities: List[Modality], train_indices=None): + def transform(self, modalities: List[Modality], train_indices=None): max_emb_size = self.get_max_embedding_size(modalities) data = pad_sequences(modalities[0].data, maxlen=max_emb_size, dtype="float32") diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py index 75c921184b3..1c1bfa1d5ec 100644 --- a/src/main/python/systemds/scuro/representations/resnet.py +++ b/src/main/python/systemds/scuro/representations/resnet.py @@ -25,8 +25,6 @@ from systemds.scuro.representations.unimodal import UnimodalRepresentation from typing import Callable, Dict, Tuple, Any import torch.utils.data -import os -import cv2 import torch import torchvision.models as models import torchvision.transforms as transforms @@ -36,22 +34,25 @@ class ResNet(UnimodalRepresentation): - def __init__(self, output_file=None): + def __init__(self, layer="avgpool", output_file=None): 
super().__init__("ResNet") self.output_file = output_file + self.layer_name = layer - def parse_all(self, file_path, indices, get_sequences=False): - resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT) + def transform(self, data): + + resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE) resnet.eval() for param in resnet.parameters(): param.requires_grad = False - transform = transforms.Compose( + t = transforms.Compose( [ transforms.ToPILImage(), - transforms.Resize((224, 224)), + transforms.Resize(256), + transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] @@ -59,7 +60,7 @@ def parse_all(self, file_path, indices, get_sequences=False): ] ) - dataset = ResNetDataset(transform=transform, video_folder_path=file_path) + dataset = ResNetDataset(data, t) embeddings = {} class Identity(torch.nn.Module): @@ -70,108 +71,72 @@ def forward(self, input_: torch.Tensor) -> torch.Tensor: res5c_output = None - def avg_pool_hook( - _module: torch.nn.Module, input_: Tuple[torch.Tensor], _output: Any - ) -> None: - nonlocal res5c_output - res5c_output = input_[0] + def get_features(name_): + def hook( + _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: Any + ): + nonlocal res5c_output + res5c_output = output + + return hook - resnet.avgpool.register_forward_hook(avg_pool_hook) + if self.layer_name: + for name, layer in resnet.named_modules(): + if name == self.layer_name: + layer.register_forward_hook(get_features(name)) + break for instance in torch.utils.data.DataLoader(dataset): video_id = instance["id"][0] frames = instance["frames"][0].to(DEVICE) - embeddings[video_id] = torch.empty((len(frames), 2048)) - batch_size = 32 + embeddings[video_id] = [] + batch_size = 64 + for start_index in range(0, len(frames), batch_size): end_index = min(start_index + batch_size, len(frames)) frame_ids_range = range(start_index, end_index) frame_batch = 
class ResNetDataset(torch.utils.data.Dataset):
    """
    Dataset that serves one video per item as a tensor of transformed frames.
    """

    def __init__(self, data, tf: Callable = None):
        """
        :param data: Indexable collection of videos, each video being a sized
            iterable of frames (not a path string)
        :param tf: Optional transformation applied to every frame; its output
            must be assignable to a (3, 224, 224) slot. If None, the frames are
            used as they are and must already have that shape.
        """
        self.data = data
        self.tf = tf

    def __getitem__(self, index) -> Dict[str, object]:
        """
        :param index: Position of the video in the data collection
        :return: Dict with the key "id" (the index itself) and the key "frames"
            (tensor of shape (number of frames, 3, 224, 224))
        """
        video = self.data[index]
        # Frame size is fixed to the 3x224x224 input this dataset is built for.
        frames = torch.empty((len(video), 3, 224, 224))

        for i, frame in enumerate(video):
            # `tf` defaults to None; calling it unconditionally raised TypeError.
            frames[i] = self.tf(frame) if self.tf is not None else torch.as_tensor(frame)
        return {"id": index, "frames": frames}

    def __len__(self) -> int:
        """
        :return: Number of videos in the dataset
        """
        return len(self.data)
np.array(modalities) @@ -46,8 +46,7 @@ def fuse(self, modalities: List[Modality], train_indices): padded_modalities = [] for modality in modalities: - scaled = self.scale_data(modality.data, train_indices) - d = pad_sequences(scaled, maxlen=max_emb_size, dtype="float32") + d = pad_sequences(modality.data, maxlen=max_emb_size, dtype="float32") padded_modalities.append(d) split_rows = int(len(modalities[0].data) / self.split) diff --git a/src/main/python/systemds/scuro/representations/sum.py b/src/main/python/systemds/scuro/representations/sum.py index bfb19d4f7d6..0608338a0fd 100644 --- a/src/main/python/systemds/scuro/representations/sum.py +++ b/src/main/python/systemds/scuro/representations/sum.py @@ -35,7 +35,7 @@ def __init__(self): """ super().__init__("Sum") - def fuse(self, modalities: List[Modality]): + def transform(self, modalities: List[Modality]): max_emb_size = self.get_max_embedding_size(modalities) data = pad_sequences(modalities[0].data, maxlen=max_emb_size, dtype="float32") diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py index 15515dd5386..4849aba1360 100644 --- a/src/main/python/systemds/scuro/representations/tfidf.py +++ b/src/main/python/systemds/scuro/representations/tfidf.py @@ -19,7 +19,6 @@ # # ------------------------------------------------------------- -import pandas as pd from sklearn.feature_extraction.text import TfidfVectorizer from systemds.scuro.representations.unimodal import UnimodalRepresentation @@ -32,17 +31,13 @@ def __init__(self, min_df, output_file=None): self.min_df = min_df self.output_file = output_file - def parse_all(self, filepath, indices): + def transform(self, data): vectorizer = TfidfVectorizer(min_df=self.min_df) - segments = read_data_from_file(filepath, indices) - X = vectorizer.fit_transform(segments.values()) + X = vectorizer.fit_transform(data) X = X.toarray() if self.output_file is not None: - df = pd.DataFrame(X) - df.index = 
segments.keys() - - save_embeddings(df, self.output_file) + save_embeddings(X, self.output_file) return X diff --git a/src/main/python/systemds/scuro/representations/unimodal.py b/src/main/python/systemds/scuro/representations/unimodal.py index ccd61977654..c56d611a744 100644 --- a/src/main/python/systemds/scuro/representations/unimodal.py +++ b/src/main/python/systemds/scuro/representations/unimodal.py @@ -29,7 +29,7 @@ def __init__(self, name): """ super().__init__(name) - def parse_all(self, file_path, indices): + def transform(self, data): raise f"Not implemented for {self.name}" diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py index cc8a180889d..209091648d5 100644 --- a/src/main/python/systemds/scuro/representations/word2vec.py +++ b/src/main/python/systemds/scuro/representations/word2vec.py @@ -21,10 +21,9 @@ import numpy as np from systemds.scuro.representations.unimodal import UnimodalRepresentation -from systemds.scuro.representations.utils import read_data_from_file, save_embeddings +from systemds.scuro.representations.utils import save_embeddings from gensim.models import Word2Vec from nltk.tokenize import word_tokenize -import nltk def get_embedding(sentence, model): @@ -44,22 +43,20 @@ def __init__(self, vector_size, min_count, window, output_file=None): self.window = window self.output_file = output_file - def parse_all(self, filepath, indices): - segments = read_data_from_file(filepath, indices) - embeddings = {} - t = [word_tokenize(s.lower()) for s in segments.values()] + def transform(self, data): + t = [word_tokenize(s.lower()) for s in data] model = Word2Vec( sentences=t, vector_size=self.vector_size, window=self.window, min_count=self.min_count, ) - - for k, v in segments.items(): - tokenized_words = word_tokenize(v.lower()) - embeddings[k] = get_embedding(tokenized_words, model) + embeddings = [] + for sentences in data: + tokens = 
word_tokenize(sentences.lower()) + embeddings.append(get_embedding(tokens, model)) if self.output_file is not None: - save_embeddings(embeddings, self.output_file) + save_embeddings(np.array(embeddings), self.output_file) - return np.array(list(embeddings.values())) + return np.array(embeddings) diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py index 9f5b8dd2d73..7f8fc80aeb2 100644 --- a/src/main/python/tests/scuro/data_generator.py +++ b/src/main/python/tests/scuro/data_generator.py @@ -23,10 +23,7 @@ from scipy.io.wavfile import write import random import os - -from systemds.scuro.modality.video_modality import VideoModality -from systemds.scuro.modality.audio_modality import AudioModality -from systemds.scuro.modality.text_modality import TextModality +from systemds.scuro.modality.type import ModalityType class TestDataGenerator: @@ -36,7 +33,7 @@ def __init__(self, modalities, path, balanced=True): self.balanced = balanced for modality in modalities: - mod_path = f"{self.path}/{modality.name.lower()}/" + mod_path = f"{self.path}/{modality.type.name}/" os.mkdir(mod_path) modality.file_path = mod_path self.labels = [] @@ -72,11 +69,11 @@ def create_multimodal_data(self, num_instances, duration=2, seed=42): speed_slow += 1 for modality in self.modalities: - if isinstance(modality, VideoModality): + if modality.type == ModalityType.VIDEO: self.__create_video_data(idx, duration, 30, speed_factor) - if isinstance(modality, AudioModality): + if modality.type == ModalityType.AUDIO: self.__create_audio_data(idx, duration, speed_factor) - if isinstance(modality, TextModality): + if modality.type == ModalityType.TEXT: self.__create_text_data(idx, speed_factor) np.save(f"{self.path}/labels.npy", np.array(self.labels)) diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py index cbbeafab8a6..5253b6dbc8a 100644 --- 
a/src/main/python/tests/scuro/test_data_loaders.py +++ b/src/main/python/tests/scuro/test_data_loaders.py @@ -22,15 +22,17 @@ import os import shutil import unittest -from systemds.scuro.modality.audio_modality import AudioModality -from systemds.scuro.modality.text_modality import TextModality -from systemds.scuro.modality.video_modality import VideoModality +from systemds.scuro.modality.unimodal_modality import UnimodalModality from systemds.scuro.representations.bert import Bert from systemds.scuro.representations.mel_spectrogram import MelSpectrogram from systemds.scuro.representations.resnet import ResNet -from systemds.scuro.representations.representation_dataloader import HDF5, NPY, Pickle from tests.scuro.data_generator import TestDataGenerator +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader +from systemds.scuro.dataloader.text_loader import TextLoader +from systemds.scuro.modality.type import ModalityType + class TestDataLoaders(unittest.TestCase): test_file_path = None @@ -53,28 +55,22 @@ def setUpClass(cls): cls.num_instances = 2 cls.indizes = [str(i) for i in range(0, cls.num_instances)] - cls.video = VideoModality( - "", ResNet(f"{cls.test_file_path}/embeddings/resnet_embeddings.hdf5") - ) - cls.audio = AudioModality( - "", - MelSpectrogram( - output_file=f"{cls.test_file_path}/embeddings/mel_sp_embeddings.npy" - ), - ) - cls.text = TextModality( - "", - Bert( - avg_layers=4, - output_file=f"{cls.test_file_path}/embeddings/bert_embeddings.pkl", - ), - ) - cls.mods = [cls.video, cls.audio, cls.text] + + video_data_loader = VideoLoader(cls.test_file_path + "/video/", cls.indizes) + audio_data_loader = AudioLoader(cls.test_file_path + "/audio/", cls.indizes) + text_data_loader = TextLoader(cls.test_file_path + "/text/", cls.indizes) + + # Load modalities (audio, video, text) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + audio = 
UnimodalModality(audio_data_loader, ModalityType.AUDIO) + text = UnimodalModality(text_data_loader, ModalityType.TEXT) + + cls.mods = [video, audio, text] cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path) cls.data_generator.create_multimodal_data(cls.num_instances) - cls.text.read_all(cls.indizes) - cls.audio.read_all(cls.indizes) - cls.video.read_all([i for i in range(0, cls.num_instances)]) + cls.text_ref = text.apply_representation(Bert()) + cls.audio_ref = audio.apply_representation(MelSpectrogram()) + cls.video_ref = video.apply_representation(ResNet()) @classmethod def tearDownClass(cls): @@ -82,35 +78,31 @@ def tearDownClass(cls): shutil.rmtree(cls.test_file_path) def test_load_audio_data_from_file(self): - load_audio = AudioModality( - f"{self.test_file_path}/embeddings/mel_sp_embeddings.npy", NPY() - ) - load_audio.read_all(self.indizes) + audio_data_loader = AudioLoader(self.test_file_path + "/audio/", self.indizes) + audio = UnimodalModality( + audio_data_loader, ModalityType.AUDIO + ).apply_representation(MelSpectrogram()) for i in range(0, self.num_instances): - assert round(sum(self.audio.data[i]), 4) == round( - sum(load_audio.data[i]), 4 - ) + assert round(sum(self.audio_ref.data[i]), 4) == round(sum(audio.data[i]), 4) def test_load_video_data_from_file(self): - load_video = VideoModality( - f"{self.test_file_path}/embeddings/resnet_embeddings.hdf5", HDF5() - ) - load_video.read_all(self.indizes) + video_data_loader = VideoLoader(self.test_file_path + "/video/", self.indizes) + video = UnimodalModality( + video_data_loader, ModalityType.VIDEO + ).apply_representation(ResNet()) for i in range(0, self.num_instances): - assert round(sum(self.video.data[i]), 4) == round( - sum(load_video.data[i]), 4 - ) + assert round(sum(self.video_ref.data[i]), 4) == round(sum(video.data[i]), 4) def test_load_text_data_from_file(self): - load_text = TextModality( - f"{self.test_file_path}/embeddings/bert_embeddings.pkl", Pickle() - ) - 
load_text.read_all(self.indizes) + text_data_loader = TextLoader(self.test_file_path + "/text/", self.indizes) + text = UnimodalModality( + text_data_loader, ModalityType.TEXT + ).apply_representation(Bert()) for i in range(0, self.num_instances): - assert round(sum(self.text.data[i]), 4) == round(sum(load_text.data[i]), 4) + assert round(sum(self.text_ref.data[i]), 4) == round(sum(text.data[i]), 4) if __name__ == "__main__": diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py index eac4a776413..f2778884339 100644 --- a/src/main/python/tests/scuro/test_dr_search.py +++ b/src/main/python/tests/scuro/test_dr_search.py @@ -28,11 +28,13 @@ from sklearn.model_selection import train_test_split, KFold from sklearn.preprocessing import MinMaxScaler +from systemds.scuro.modality.unimodal_modality import UnimodalModality +from systemds.scuro.modality.type import ModalityType +from systemds.scuro.dataloader.text_loader import TextLoader +from systemds.scuro.dataloader.audio_loader import AudioLoader +from systemds.scuro.dataloader.video_loader import VideoLoader from systemds.scuro.aligner.dr_search import DRSearch from systemds.scuro.aligner.task import Task -from systemds.scuro.modality.audio_modality import AudioModality -from systemds.scuro.modality.text_modality import TextModality -from systemds.scuro.modality.video_modality import VideoModality from systemds.scuro.models.model import Model from systemds.scuro.representations.average import Average from systemds.scuro.representations.bert import Bert @@ -101,28 +103,20 @@ def setUpClass(cls): cls.num_instances = 8 cls.indizes = [str(i) for i in range(0, cls.num_instances)] - cls.video = VideoModality( - "", ResNet(f"{cls.test_file_path}/embeddings/resnet_embeddings.hdf5") - ) - cls.audio = AudioModality( - "", - MelSpectrogram( - output_file=f"{cls.test_file_path}/embeddings/mel_sp_embeddings.npy" - ), - ) - cls.text = TextModality( - "", - Bert( - avg_layers=4, - 
output_file=f"{cls.test_file_path}/embeddings/bert_embeddings.pkl", - ), - ) - cls.mods = [cls.video, cls.audio, cls.text] - cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path) + video_data_loader = VideoLoader(cls.test_file_path + "/video/", cls.indizes) + audio_data_loader = AudioLoader(cls.test_file_path + "/audio/", cls.indizes) + text_data_loader = TextLoader(cls.test_file_path + "/text/", cls.indizes) + video = UnimodalModality(video_data_loader, ModalityType.VIDEO) + audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO) + text = UnimodalModality(text_data_loader, ModalityType.TEXT) + cls.data_generator = TestDataGenerator([video, audio, text], cls.test_file_path) cls.data_generator.create_multimodal_data(cls.num_instances) - cls.text.read_all(cls.indizes) - cls.audio.read_all(cls.indizes) - cls.video.read_all([i for i in range(0, cls.num_instances)]) + + cls.bert = text.apply_representation(Bert()) + cls.mel_spe = audio.apply_representation(MelSpectrogram()) + cls.resnet = video.apply_representation(ResNet()) + + cls.mods = [cls.bert, cls.mel_spe, cls.resnet] split = train_test_split( cls.indizes, cls.data_generator.labels, test_size=0.2, random_state=42