diff --git a/src/main/python/systemds/scuro/__init__.py b/src/main/python/systemds/scuro/__init__.py
index 84494a158ef..53b68d430fa 100644
--- a/src/main/python/systemds/scuro/__init__.py
+++ b/src/main/python/systemds/scuro/__init__.py
@@ -18,59 +18,61 @@
 # under the License.
 #
 # -------------------------------------------------------------
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.json_loader import JSONLoader
 from systemds.scuro.representations.representation import Representation
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.concatenation import Concatenation
-from systemds.scuro.representations.fusion import Fusion
 from systemds.scuro.representations.sum import Sum
 from systemds.scuro.representations.max import RowMax
 from systemds.scuro.representations.multiplication import Multiplication
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.resnet import ResNet
 from systemds.scuro.representations.bert import Bert
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from systemds.scuro.representations.lstm import LSTM
-from systemds.scuro.representations.representation_dataloader import (
-    NPY,
-    Pickle,
-    HDF5,
-    JSON,
-)
+from systemds.scuro.representations.bow import BoW
+from systemds.scuro.representations.glove import GloVe
+from systemds.scuro.representations.tfidf import TfIdf
+from systemds.scuro.representations.word2vec import W2V
 from systemds.scuro.models.model import Model
 from systemds.scuro.models.discrete_model import DiscreteModel
-from systemds.scuro.modality.aligned_modality import AlignedModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.text_modality import TextModality
 from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.modality.type import ModalityType
 from systemds.scuro.aligner.dr_search import DRSearch
 from systemds.scuro.aligner.task import Task
 
 
 __all__ = [
+    "BaseLoader",
+    "AudioLoader",
+    "VideoLoader",
+    "TextLoader",
+    "JSONLoader",
     "Representation",
     "Average",
     "Concatenation",
-    "Fusion",
     "Sum",
     "RowMax",
     "Multiplication",
     "MelSpectrogram",
     "ResNet",
     "Bert",
-    "UnimodalRepresentation",
     "LSTM",
-    "NPY",
-    "Pickle",
-    "HDF5",
-    "JSON",
+    "BoW",
+    "GloVe",
+    "TfIdf",
+    "W2V",
     "Model",
     "DiscreteModel",
-    "AlignedModality",
-    "AudioModality",
-    "VideoModality",
-    "TextModality",
     "Modality",
+    "UnimodalModality",
+    "TransformedModality",
+    "ModalityType",
     "DRSearch",
     "Task",
 ]
diff --git a/src/main/python/systemds/scuro/aligner/alignment.py b/src/main/python/systemds/scuro/aligner/alignment.py
index e341e1b76bf..62f88a272b9 100644
--- a/src/main/python/systemds/scuro/aligner/alignment.py
+++ b/src/main/python/systemds/scuro/aligner/alignment.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 from aligner.alignment_strategy import AlignmentStrategy
-from modality.aligned_modality import AlignedModality
 from modality.modality import Modality
 from modality.representation import Representation
 from aligner.similarity_measures import Measure
@@ -46,4 +45,4 @@ def __init__(
         self.similarity_measure = similarity_measure
 
     def align_modalities(self) -> Modality:
-        return AlignedModality(Representation())
+        return Modality(Representation())
diff --git a/src/main/python/systemds/scuro/aligner/dr_search.py b/src/main/python/systemds/scuro/aligner/dr_search.py
index 24f3c3236f5..b46139dff30 100644
--- a/src/main/python/systemds/scuro/aligner/dr_search.py
+++ b/src/main/python/systemds/scuro/aligner/dr_search.py
@@ -23,7 +23,6 @@
 from typing import List
 
 from systemds.scuro.aligner.task import Task
-from systemds.scuro.modality.aligned_modality import AlignedModality
 from systemds.scuro.modality.modality import Modality
 from systemds.scuro.representations.representation import Representation
 
@@ -64,27 +63,25 @@ def __init__(
 
     def set_best_params(
         self,
-        modality_name: str,
         representation: Representation,
         scores: List[float],
         modality_names: List[str],
     ):
         """
         Updates the best parameters for given modalities, representation, and score
-        :param modality_name: The name of the aligned modality
         :param representation: The representation used to retrieve the current score
-        :param score: achieved score for the set of modalities and representation
+        :param scores: achieved train/test scores for the set of modalities and representation
         :param modality_names: List of modality names used in this setting
         :return:
         """
 
         # check if modality name is already in dictionary
-        if modality_name not in self.scores.keys():
+        if "_".join(modality_names) not in self.scores.keys():
             # if not add it to dictionary
-            self.scores[modality_name] = {}
+            self.scores["_".join(modality_names)] = {}
 
         # set score for representation
-        self.scores[modality_name][representation] = scores
+        self.scores["_".join(modality_names)][representation] = scores
 
         # compare current score with best score
         if scores[1] > self.best_score:
@@ -113,13 +110,12 @@ def fit_random(self, seed=-1):
         modality_combination = random.choice(modalities)
         representation = random.choice(self.representations)
 
-        modality = AlignedModality(representation, list(modality_combination))  # noqa
-        modality.combine()
+        modality = modality_combination[0].combine(
+            modality_combination[1:], representation
+        )
 
         scores = self.task.run(modality.data)
-        self.set_best_params(
-            modality.name, representation, scores, modality.get_modality_names()
-        )
+        self.set_best_params(representation, scores, modality.get_modality_names())
 
         return self.best_representation, self.best_score, self.best_modalities
 
@@ -133,14 +129,14 @@ def fit_enumerate_all(self):
         for M in range(1, len(self.modalities) + 1):
             for combination in itertools.combinations(self.modalities, M):
                 for representation in self.representations:
-                    modality = AlignedModality(
-                        representation, list(combination)
-                    )  # noqa
-                    modality.combine()
+                    modality = combination[0]
+                    if len(combination) > 1:
+                        modality = combination[0].combine(
+                            list(combination[1:]), representation
+                        )
 
                     scores = self.task.run(modality.data)
                     self.set_best_params(
-                        modality.name,
                         representation,
                         scores,
                         modality.get_modality_names(),
@@ -164,7 +160,8 @@ def transform(self, modalities: List[Modality]):
         for modality_name in self.best_modalities:
             used_modalities.append(get_modalities_by_name(modalities, modality_name))
 
-        modality = AlignedModality(self.best_representation, used_modalities)  # noqa
-        modality.combine(self.task.train_indices)
+        modality = used_modalities[0].combine(
+            used_modalities[1:], self.best_representation
+        )
 
         return modality.data
diff --git a/src/main/python/systemds/scuro/dataloader/__init__.py b/src/main/python/systemds/scuro/dataloader/__init__.py
new file mode 100644
index 00000000000..e66abb4646f
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/__init__.py
@@ -0,0 +1,20 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
diff --git a/src/main/python/systemds/scuro/dataloader/audio_loader.py b/src/main/python/systemds/scuro/dataloader/audio_loader.py
new file mode 100644
index 00000000000..f85b1b80faa
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/audio_loader.py
@@ -0,0 +1,39 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional
+
+import librosa
+from systemds.scuro.dataloader.base_loader import BaseLoader
+
+
+class AudioLoader(BaseLoader):
+    def __init__(
+        self, source_path: str, indices: List[str], chunk_size: Optional[int] = None
+    ):
+        """Loader for audio files that are decoded with librosa"""
+        super().__init__(source_path, indices, chunk_size)
+
+    def extract(self, file: str):
+        """Appends the decoded audio signal of the given file to self.data"""
+        self.file_sanity_check(file)
+        # librosa.load returns (signal, sampling rate); only the signal is kept
+        waveform, _ = librosa.load(file)
+        self.data.append(waveform)
diff --git a/src/main/python/systemds/scuro/dataloader/base_loader.py b/src/main/python/systemds/scuro/dataloader/base_loader.py
new file mode 100644
index 00000000000..2ef60677c67
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/base_loader.py
@@ -0,0 +1,92 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import os
+from abc import ABC, abstractmethod
+from typing import List, Optional, Union
+
+
+class BaseLoader(ABC):
+    def __init__(
+        self, source_path: str, indices: List[str], chunk_size: Optional[int] = None
+    ):
+        """
+        Base class to load raw data for a given list of indices and store it in the data object
+        :param source_path: The location where the raw data lies (a directory or a single file)
+        :param indices: A list of indices as strings that correspond to the file names
+        (otherwise please provide your own Dataloader that knows about the file name convention)
+        :param chunk_size: An optional argument to load the data in chunks instead of all at once
+        """
+        self.data = []
+        self.source_path = source_path
+        self.indices = indices
+        self.chunk_size = chunk_size
+        self.next_chunk = 0
+
+        if self.chunk_size:
+            # ceiling division so that a trailing partial chunk is counted as well
+            self.num_chunks = -(-len(self.indices) // self.chunk_size)
+
+    def load(self):
+        """
+        Loads the raw data either chunk wise (if a chunk size is defined) or all at once
+        """
+        if self.chunk_size:
+            return self._load_next_chunk()
+        return self._load(self.indices)
+
+    def _load_next_chunk(self):
+        """
+        Loads the next chunk of data, replacing the previously loaded chunk in self.data
+        """
+        self.data = []
+        start = self.next_chunk * self.chunk_size
+        self.next_chunk += 1
+        return self._load(self.indices[start : start + self.chunk_size])
+
+    def _load(self, indices: List[str]):
+        """
+        Extracts the given indices: one file per index if source_path is a directory
+        (extension taken from the first directory entry), else all from the single file
+        """
+        if os.path.isdir(self.source_path):
+            _, ext = os.path.splitext(os.listdir(self.source_path)[0])
+            for index in indices:
+                # join handles source paths with and without a trailing separator
+                self.extract(os.path.join(self.source_path, index + ext))
+        else:
+            self.extract(self.source_path, indices)
+        return self.data
+
+    @abstractmethod
+    def extract(self, file: str, index: Optional[Union[str, List[str]]] = None):
+        """Reads the given file and appends its content to self.data (loader specific)"""
+
+    def file_sanity_check(self, file):
+        """
+        Checks that the file can be found and is not empty
+        """
+        try:
+            file_size = os.path.getsize(file)
+        except OSError as err:
+            raise FileNotFoundError(f"Error: File {file} not found!") from err
+
+        if file_size == 0:
+            raise ValueError(f"File {file} is empty")
diff --git a/src/main/python/systemds/scuro/dataloader/json_loader.py b/src/main/python/systemds/scuro/dataloader/json_loader.py
new file mode 100644
index 00000000000..c4e3b956111
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/json_loader.py
@@ -0,0 +1,43 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+import json
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from typing import Optional, List
+
+
+class JSONLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        field: str,
+        chunk_size: Optional[int] = None,
+    ):
+        """Loader that reads the value of `field` for every index from one JSON file"""
+        super().__init__(source_path, indices, chunk_size)
+        self.field = field
+
+    def extract(self, file: str, indices: List[str]):
+        self.file_sanity_check(file)
+        with open(file) as json_handle:
+            content = json.load(json_handle)
+        self.data.extend(content[idx][self.field] for idx in indices)
diff --git a/src/main/python/systemds/scuro/dataloader/text_loader.py b/src/main/python/systemds/scuro/dataloader/text_loader.py
new file mode 100644
index 00000000000..f614472bce6
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/text_loader.py
@@ -0,0 +1,44 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from typing import Optional, Pattern, List
+import re
+
+
+class TextLoader(BaseLoader):
+    def __init__(
+        self,
+        source_path: str,
+        indices: List[str],
+        chunk_size: Optional[int] = None,
+        prefix: Optional[Pattern[str]] = None,
+    ):
+        """Loader for text files; matches of `prefix` are removed from every line read"""
+        super().__init__(source_path, indices, chunk_size)
+        self.prefix = prefix
+
+    def extract(self, file: str):
+        """Appends each line of the file (prefix matches and newlines removed) to self.data"""
+        self.file_sanity_check(file)
+        with open(file) as text_file:
+            for raw_line in text_file:
+                text = re.sub(self.prefix, "", raw_line) if self.prefix else raw_line
+                self.data.append(text.replace("\n", ""))
diff --git a/src/main/python/systemds/scuro/dataloader/video_loader.py b/src/main/python/systemds/scuro/dataloader/video_loader.py
new file mode 100644
index 00000000000..6da20b34756
--- /dev/null
+++ b/src/main/python/systemds/scuro/dataloader/video_loader.py
@@ -0,0 +1,52 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from typing import List, Optional
+
+import numpy as np
+
+from systemds.scuro.dataloader.base_loader import BaseLoader
+import cv2
+
+
+class VideoLoader(BaseLoader):
+    def __init__(
+        self, source_path: str, indices: List[str], chunk_size: Optional[int] = None
+    ):
+        """Loader for video files that are decoded frame by frame with OpenCV"""
+        super().__init__(source_path, indices, chunk_size)
+
+    def extract(self, file: str):
+        """Appends the frames (RGB, float32, divided by 255) as one list to self.data"""
+        self.file_sanity_check(file)
+        cap = cv2.VideoCapture(file)
+        frames = []
+        try:
+            while cap.isOpened():
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                frames.append(frame.astype(np.float32) / 255.0)
+        finally:
+            # always free the capture handle, also when decoding fails midway
+            cap.release()
+
+        self.data.append(frames)
diff --git a/src/main/python/systemds/scuro/main.py b/src/main/python/systemds/scuro/main.py
index f28b271b979..8a51e098cc5 100644
--- a/src/main/python/systemds/scuro/main.py
+++ b/src/main/python/systemds/scuro/main.py
@@ -18,21 +18,20 @@
 # under the License.
 #
 # -------------------------------------------------------------
-import collections
-import json
-from datetime import datetime
-
+from systemds.scuro.representations.bert import Bert
+from systemds.scuro.representations.resnet import ResNet
+from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.concatenation import Concatenation
-from systemds.scuro.modality.aligned_modality import AlignedModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.representations.unimodal import Pickle, JSON, HDF5, NPY
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.models.discrete_model import DiscreteModel
 from systemds.scuro.aligner.task import Task
 from systemds.scuro.aligner.dr_search import DRSearch
 
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+
 
 class CustomTask(Task):
     def __init__(self, model, labels, train_indices, val_indices):
@@ -49,18 +48,32 @@ def run(self, data):
 train_indices = []
 val_indices = []
 
+all_indices = []
+
 video_path = ""
 audio_path = ""
 text_path = ""
 
+
+# Define dataloaders
+video_data_loader = VideoLoader(video_path, all_indices, chunk_size=10)
+text_data_loader = TextLoader(text_path, all_indices)
+audio_data_loader = AudioLoader(audio_path, all_indices)
+
 # Load modalities (audio, video, text)
-video = VideoModality(video_path, HDF5(), train_indices)
-audio = AudioModality(audio_path, Pickle(), train_indices)
-text = TextModality(text_path, NPY(), train_indices)
+video = UnimodalModality(video_data_loader, "VIDEO")
+audio = UnimodalModality(audio_data_loader, "AUDIO")
+text = UnimodalModality(text_data_loader, "TEXT")
+
+# Define unimodal representations
+r_v = ResNet()
+r_a = MelSpectrogram()
+r_t = Bert()
 
-video.read_all()
-audio.read_all()
-text.read_all()
+# Transform raw unimodal data
+video.apply_representation(r_v)
+audio.apply_representation(r_a)
+text.apply_representation(r_t)
 
 modalities = [text, audio, video]
 
diff --git a/src/main/python/systemds/scuro/modality/aligned_modality.py b/src/main/python/systemds/scuro/modality/aligned_modality.py
deleted file mode 100644
index 839b9d296f8..00000000000
--- a/src/main/python/systemds/scuro/modality/aligned_modality.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-from typing import List
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.fusion import Fusion
-
-
-class AlignedModality(Modality):
-    def __init__(self, representation: Fusion, modalities: List[Modality]):
-        """
-        Defines the modality that is created during the fusion process
-        :param representation: The representation for the aligned modality
-        :param modalities: List of modalities to be combined
-        """
-        name = ""
-        for modality in modalities:
-            name += modality.name
-        super().__init__(representation, modality_name=name)
-        self.modalities = modalities
-
-    def combine(self):
-        """
-        Initiates the call to fuse the given modalities depending on the Fusion type
-        """
-        self.data = self.representation.fuse(self.modalities)  # noqa
-
-    def get_modality_names(self):
-        names = []
-        for modality in self.modalities:
-            names.append(modality.name)
-
-        return names
diff --git a/src/main/python/systemds/scuro/modality/audio_modality.py b/src/main/python/systemds/scuro/modality/audio_modality.py
deleted file mode 100644
index ba849622269..00000000000
--- a/src/main/python/systemds/scuro/modality/audio_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class AudioModality(Modality):
-    def __init__(
-        self,
-        file_path: str,
-        representation: UnimodalRepresentation,
-        train_indices=None,
-        start_index: int = 0,
-    ):
-        """
-        Creates an audio modality
-        :param file_path: path to file where the audio embeddings are stored
-        :param representation: Unimodal representation that indicates how to extract the data from the file
-        """
-        super().__init__(representation, start_index, "Audio", train_indices)
-        self.file_path = file_path
-
-    def file_sanity_check(self):
-        """
-        Checks if the file can be found is not empty
-        """
-        try:
-            file_size = os.path.getsize(self.file_path)
-        except:
-            raise (f"Error: File {0} not found!".format(self.file_path))
-
-        if file_size == 0:
-            raise ("File {0} is empty".format(self.file_path))
-
-    def read_chunk(self):
-        pass
-
-    def read_all(self, indices=None):
-        self.data = self.representation.parse_all(
-            self.file_path, indices=indices
-        )  # noqa
diff --git a/src/main/python/systemds/scuro/modality/modality.py b/src/main/python/systemds/scuro/modality/modality.py
index a899576d5b8..9a3d1b148d2 100644
--- a/src/main/python/systemds/scuro/modality/modality.py
+++ b/src/main/python/systemds/scuro/modality/modality.py
@@ -18,41 +18,27 @@
 # under the License.
 #
 # -------------------------------------------------------------
+from typing import List
 
-from systemds.scuro.representations.representation import Representation
+from systemds.scuro.modality.type import ModalityType
 
 
 class Modality:
 
-    def __init__(
-        self,
-        representation: Representation,
-        start_index: int = 0,
-        modality_name="",
-        train_indices=None,
-    ):
+    def __init__(self, modality_type: ModalityType):
         """
-        Parent class of the different Modalities
-        :param representation: Specifies how the data should be represented for a specific modality
-        :param start_index: Defines the first index used for the alignment
-        :param modality_name: Name of the modality
-        :param train_indices: List of indices used for train-test split
+        Parent class of the different Modalities (unimodal & multimodal)
+        :param modality_type: Type of the modality
         """
-        self.representation = representation
-        self.start_index = start_index
-        self.name = modality_name
+        self.type = modality_type
         self.data = None
-        self.train_indices = train_indices
+        self.data_type = None
+        self.cost = None
+        self.shape = None
+        self.schema = {}
 
-    def read_chunk(self):
+    def get_modality_names(self) -> List[str]:
         """
-        Extracts a data chunk of the modality according to the window size specified in params
+        Extracts the individual unimodal modalities for a given transformed modality.
         """
-        raise NotImplementedError
-
-    def read_all(self, indices):
-        """
-        Implemented for every unique modality to read all samples from a specified format
-        :param indices: List of indices to be read
-        """
-        pass
+        return [modality.name for modality in ModalityType if modality in self.type]
diff --git a/src/main/python/systemds/scuro/modality/text_modality.py b/src/main/python/systemds/scuro/modality/text_modality.py
deleted file mode 100644
index c636de71672..00000000000
--- a/src/main/python/systemds/scuro/modality/text_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class TextModality(Modality):
-    def __init__(
-        self,
-        file_path: str,
-        representation: UnimodalRepresentation,
-        train_indices=None,
-        start_index: int = 0,
-    ):
-        """
-        Creates a text modality
-        :param file_path: path to file(s) where the text data is stored
-        :param representation: Unimodal representation that indicates how to extract the data from the file
-        """
-        super().__init__(representation, start_index, "Text", train_indices)
-        self.file_path = file_path
-
-    def file_sanity_check(self):
-        """
-        Checks if the file can be found is not empty
-        """
-        try:
-            file_size = os.path.getsize(self.file_path)
-        except:
-            raise (f"Error: File {0} not found!".format(self.file_path))
-
-        if file_size == 0:
-            raise ("File {0} is empty".format(self.file_path))
-
-    def read_chunk(self):
-        pass
-
-    def read_all(self, indices=None):
-        self.data = self.representation.parse_all(
-            self.file_path, indices=indices
-        )  # noqa
diff --git a/src/main/python/systemds/scuro/modality/transformed.py b/src/main/python/systemds/scuro/modality/transformed.py
new file mode 100644
index 00000000000..61c327e469e
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/transformed.py
@@ -0,0 +1,52 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from functools import reduce
+from operator import or_
+
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.type import ModalityType
+
+
+class TransformedModality(Modality):
+    """Modality holding data produced by applying a transformation."""
+
+    def __init__(self, modality_type: ModalityType, transformation):
+        """
+        Creates a modality whose data results from a transformation
+        :param modality_type: Type of the original modality(ies)
+        :param transformation: Representation to be applied on the modality
+        """
+        super().__init__(modality_type)
+        self.transformation = transformation
+
+    def combine(self, other, fusion_method):
+        """
+        Combines this modality with one or more modalities using a fusion method
+        :param other: A single modality or an iterable of modalities to combine
+        :param fusion_method: The fusion method to be used to combine modalities
+        :return: New TransformedModality holding the fused data
+        """
+        # Wrap a bare modality so a single operand is not iterated over.
+        others = [other] if isinstance(other, Modality) else list(other)
+        fused_type = reduce(or_, (o.type for o in others), self.type)
+        fused_modality = TransformedModality(fused_type, fusion_method)
+        fused_modality.data = fusion_method.transform([self, *others])
+        return fused_modality
diff --git a/src/main/python/systemds/scuro/modality/type.py b/src/main/python/systemds/scuro/modality/type.py
new file mode 100644
index 00000000000..c451eea6f1d
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/type.py
@@ -0,0 +1,31 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from enum import Enum, Flag, auto
+
+
+class ModalityType(Flag):
+    TEXT = auto()
+    AUDIO = auto()
+    VIDEO = auto()
+
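+    # No custom __init__(self, value, name) is needed here: Flag already sets
+    # each member's value and name, and members can be combined with "|"
+    # (e.g. TEXT | AUDIO) to describe several modality types at once.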
diff --git a/src/main/python/systemds/scuro/modality/unimodal_modality.py b/src/main/python/systemds/scuro/modality/unimodal_modality.py
new file mode 100644
index 00000000000..976d4194d47
--- /dev/null
+++ b/src/main/python/systemds/scuro/modality/unimodal_modality.py
@@ -0,0 +1,59 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from systemds.scuro.dataloader.base_loader import BaseLoader
+from systemds.scuro.modality.modality import Modality
+from systemds.scuro.modality.transformed import TransformedModality
+from systemds.scuro.modality.type import ModalityType
+
+
+class UnimodalModality(Modality):
+    """Modality whose raw data is supplied by a single data loader."""
+
+    def __init__(self, data_loader: BaseLoader, modality_type: ModalityType):
+        """
+        Creates a unimodal modality
+        :param data_loader: Defines how the raw data should be loaded
+        :param modality_type: Type of the modality
+        """
+        super().__init__(modality_type)
+        self.data_loader = data_loader
+
+    def extract_raw_data(self):
+        """
+        Stores the result of the data loader's load() call in self.data
+        TODO: schema
+        """
+        self.data = self.data_loader.load()
+
+    def apply_representation(self, representation):
+        """Returns a TransformedModality with representation.transform applied."""
+        result = TransformedModality(self.type, representation)
+        if not self.data_loader.chunk_size:
+            if not self.data:
+                self.extract_raw_data()
+            result.data = representation.transform(self.data)
+            return result
+        # Chunked mode: keep loading until the loader reports no chunks are left.
+        result.data = []
+        while self.data_loader.next_chunk < self.data_loader.num_chunks:
+            self.extract_raw_data()
+            result.data.extend(representation.transform(self.data))
+        return result
diff --git a/src/main/python/systemds/scuro/modality/video_modality.py b/src/main/python/systemds/scuro/modality/video_modality.py
deleted file mode 100644
index a6cedf6c86a..00000000000
--- a/src/main/python/systemds/scuro/modality/video_modality.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# -------------------------------------------------------------
-import os
-
-from systemds.scuro.modality.modality import Modality
-from systemds.scuro.representations.unimodal import UnimodalRepresentation
-
-
-class VideoModality(Modality):
-    def __init__(
-        self,
-        file_path: str,
-        representation: UnimodalRepresentation,
-        train_indices=None,
-        start_index: int = 0,
-    ):
-        """
-        Creates a video modality
-        :param file_path: path to file where the video embeddings (for now) are stored
-        :param representation: Unimodal representation that indicates how to extract the data from the file
-        """
-        super().__init__(representation, start_index, "Video", train_indices)
-        self.file_path = file_path
-
-    def file_sanity_check(self):
-        """
-        Checks if the file can be found is not empty
-        """
-        try:
-            file_size = os.path.getsize(self.file_path)
-        except:
-            raise (f"Error: File {0} not found!".format(self.file_path))
-
-        if file_size == 0:
-            raise ("File {0} is empty".format(self.file_path))
-
-    def read_chunk(self):
-        pass
-
-    def read_all(self, indices=None):
-        self.data = self.representation.parse_all(
-            self.file_path, indices=indices
-        )  # noqa
diff --git a/src/main/python/systemds/scuro/representations/average.py b/src/main/python/systemds/scuro/representations/average.py
index d10778f1136..db44050e9e0 100644
--- a/src/main/python/systemds/scuro/representations/average.py
+++ b/src/main/python/systemds/scuro/representations/average.py
@@ -36,7 +36,7 @@ def __init__(self):
         """
         super().__init__("Average")
 
-    def fuse(self, modalities: List[Modality]):
+    def transform(self, modalities: List[Modality]):
         max_emb_size = self.get_max_embedding_size(modalities)
 
         padded_modalities = []
diff --git a/src/main/python/systemds/scuro/representations/bert.py b/src/main/python/systemds/scuro/representations/bert.py
index 85d0b1ad658..0fcf1e8d280 100644
--- a/src/main/python/systemds/scuro/representations/bert.py
+++ b/src/main/python/systemds/scuro/representations/bert.py
@@ -24,7 +24,7 @@
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 import torch
 from transformers import BertTokenizer, BertModel
-from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
 
 
 class Bert(UnimodalRepresentation):
@@ -34,8 +34,7 @@ def __init__(self, avg_layers=None, output_file=None):
         self.avg_layers = avg_layers
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
-        data = read_data_from_file(filepath, indices)
+    def transform(self, data):
 
         model_name = "bert-base-uncased"
         tokenizer = BertTokenizer.from_pretrained(
@@ -47,13 +46,10 @@ def parse_all(self, filepath, indices):
         else:
             model = BertModel.from_pretrained(model_name)
 
-        embeddings = self.create_embeddings(list(data.values()), model, tokenizer)
+        embeddings = self.create_embeddings(data, model, tokenizer)
 
         if self.output_file is not None:
-            data = {}
-            for i in range(0, embeddings.shape[0]):
-                data[indices[i]] = embeddings[i]
-            save_embeddings(data, self.output_file)
+            save_embeddings(embeddings, self.output_file)
 
         return embeddings
 
@@ -75,8 +71,5 @@ def create_embeddings(self, data, model, tokenizer):
                 cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
             embeddings.append(cls_embedding)
 
-        if self.output_file is not None:
-            save_embeddings(embeddings, self.output_file)
-
         embeddings = np.array(embeddings)
         return embeddings.reshape((embeddings.shape[0], embeddings.shape[-1]))
diff --git a/src/main/python/systemds/scuro/representations/bow.py b/src/main/python/systemds/scuro/representations/bow.py
index dc5013b3540..bd54654a5cb 100644
--- a/src/main/python/systemds/scuro/representations/bow.py
+++ b/src/main/python/systemds/scuro/representations/bow.py
@@ -19,11 +19,10 @@
 #
 # -------------------------------------------------------------
 
-import pandas as pd
 from sklearn.feature_extraction.text import CountVectorizer
 
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
-from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
 
 
 class BoW(UnimodalRepresentation):
@@ -33,19 +32,14 @@ def __init__(self, ngram_range, min_df, output_file=None):
         self.min_df = min_df
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
+    def transform(self, data):
         vectorizer = CountVectorizer(
             ngram_range=(1, self.ngram_range), min_df=self.min_df
         )
 
-        segments = read_data_from_file(filepath, indices)
-        X = vectorizer.fit_transform(segments.values())
-        X = X.toarray()
+        X = vectorizer.fit_transform(data).toarray()
 
         if self.output_file is not None:
-            df = pd.DataFrame(X)
-            df.index = segments.keys()
-
-            save_embeddings(df, self.output_file)
+            save_embeddings(X, self.output_file)
 
         return X
diff --git a/src/main/python/systemds/scuro/representations/concatenation.py b/src/main/python/systemds/scuro/representations/concatenation.py
index 7694fa68977..fd9293d3997 100644
--- a/src/main/python/systemds/scuro/representations/concatenation.py
+++ b/src/main/python/systemds/scuro/representations/concatenation.py
@@ -37,7 +37,7 @@ def __init__(self, padding=True):
         super().__init__("Concatenation")
         self.padding = padding
 
-    def fuse(self, modalities: List[Modality]):
+    def transform(self, modalities: List[Modality]):
         if len(modalities) == 1:
             return np.array(modalities[0].data)
 
diff --git a/src/main/python/systemds/scuro/representations/fusion.py b/src/main/python/systemds/scuro/representations/fusion.py
index e84e59f6663..623979dd052 100644
--- a/src/main/python/systemds/scuro/representations/fusion.py
+++ b/src/main/python/systemds/scuro/representations/fusion.py
@@ -32,7 +32,7 @@ def __init__(self, name):
         """
         super().__init__(name)
 
-    def fuse(self, modalities: List[Modality]):
+    def transform(self, modalities: List[Modality]):
         """
         Implemented for every child class and creates a fused representation out of
         multiple modalities
diff --git a/src/main/python/systemds/scuro/representations/glove.py b/src/main/python/systemds/scuro/representations/glove.py
index 840360540e5..cf13c717d2f 100644
--- a/src/main/python/systemds/scuro/representations/glove.py
+++ b/src/main/python/systemds/scuro/representations/glove.py
@@ -18,7 +18,6 @@
 # under the License.
 #
 # -------------------------------------------------------------
-import nltk
 import numpy as np
 from nltk import word_tokenize
 
@@ -43,24 +42,24 @@ def __init__(self, glove_path, output_file=None):
         self.glove_path = glove_path
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
+    def transform(self, data):
         glove_embeddings = load_glove_embeddings(self.glove_path)
-        segments = read_data_from_file(filepath, indices)
 
-        embeddings = {}
-        for k, v in segments.items():
-            tokens = word_tokenize(v.lower())
-            embeddings[k] = np.mean(
-                [
-                    glove_embeddings[token]
-                    for token in tokens
-                    if token in glove_embeddings
-                ],
-                axis=0,
+        embeddings = []
+        for sentences in data:
+            tokens = word_tokenize(sentences.lower())
+            embeddings.append(
+                np.mean(
+                    [
+                        glove_embeddings[token]
+                        for token in tokens
+                        if token in glove_embeddings
+                    ],
+                    axis=0,
+                )
             )
 
         if self.output_file is not None:
-            save_embeddings(embeddings, self.output_file)
+            save_embeddings(np.array(embeddings), self.output_file)
 
-        embeddings = np.array(list(embeddings.values()))
-        return embeddings
+        return np.array(embeddings)
diff --git a/src/main/python/systemds/scuro/representations/lstm.py b/src/main/python/systemds/scuro/representations/lstm.py
index 3687ff65147..649b81117b2 100644
--- a/src/main/python/systemds/scuro/representations/lstm.py
+++ b/src/main/python/systemds/scuro/representations/lstm.py
@@ -40,17 +40,17 @@ def __init__(self, width=128, depth=1, dropout_rate=0.1):
         self.dropout_rate = dropout_rate
         self.unimodal_embeddings = {}
 
-    def fuse(self, modalities: List[Modality], train_indices=None):
+    def transform(self, modalities: List[Modality]):
         size = len(modalities[0].data)
 
         result = np.zeros((size, 0))
 
         for modality in modalities:
-            if modality.name in self.unimodal_embeddings.keys():
-                out = self.unimodal_embeddings.get(modality.name)
+            if modality.type in self.unimodal_embeddings.keys():
+                out = self.unimodal_embeddings.get(modality.type)
             else:
                 out = self.run_lstm(modality.data)
-                self.unimodal_embeddings[modality.name] = out
+                self.unimodal_embeddings[modality.type] = out
 
             result = np.concatenate([result, out], axis=-1)
 
diff --git a/src/main/python/systemds/scuro/representations/max.py b/src/main/python/systemds/scuro/representations/max.py
index 2e1e8644773..194b20801e0 100644
--- a/src/main/python/systemds/scuro/representations/max.py
+++ b/src/main/python/systemds/scuro/representations/max.py
@@ -38,7 +38,7 @@ def __init__(self, split=4):
         super().__init__("RowMax")
         self.split = split
 
-    def fuse(
+    def transform(
         self,
         modalities: List[Modality],
     ):
diff --git a/src/main/python/systemds/scuro/representations/mel_spectrogram.py b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
index 27aba8b997d..57a7fab83e2 100644
--- a/src/main/python/systemds/scuro/representations/mel_spectrogram.py
+++ b/src/main/python/systemds/scuro/representations/mel_spectrogram.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 
-import os
 import pickle
 
 import librosa
@@ -35,19 +34,15 @@ def __init__(self, avg=True, output_file=None):
         self.avg = avg
         self.output_file = output_file
 
-    def parse_all(self, file_path, indices, get_sequences=False):
+    def transform(self, data):
         result = []
         max_length = 0
-        if os.path.isdir(file_path):
-            for filename in os.listdir(file_path):
-                f = os.path.join(file_path, filename)
-                if os.path.isfile(f):
-                    y, sr = librosa.load(f)
-                    S = librosa.feature.melspectrogram(y=y, sr=sr)
-                    S_dB = librosa.power_to_db(S, ref=np.max)
-                    if S_dB.shape[-1] > max_length:
-                        max_length = S_dB.shape[-1]
-                    result.append(S_dB)
+        for sample in data:
+            S = librosa.feature.melspectrogram(y=sample)
+            S_dB = librosa.power_to_db(S, ref=np.max)
+            if S_dB.shape[-1] > max_length:
+                max_length = S_dB.shape[-1]
+            result.append(S_dB)
 
         r = []
         for elem in result:
@@ -57,9 +52,9 @@ def parse_all(self, file_path, indices, get_sequences=False):
         np_array_r = np.array(r) if not self.avg else np.mean(np.array(r), axis=1)
 
         if self.output_file is not None:
-            data = {}
+            data = []
             for i in range(0, np_array_r.shape[0]):
-                data[indices[i]] = np_array_r[i]
+                data.append(np_array_r[i])
             with open(self.output_file, "wb") as file:
                 pickle.dump(data, file)
 
diff --git a/src/main/python/systemds/scuro/representations/multiplication.py b/src/main/python/systemds/scuro/representations/multiplication.py
index 18f34bae6f9..2934fe5b3c9 100644
--- a/src/main/python/systemds/scuro/representations/multiplication.py
+++ b/src/main/python/systemds/scuro/representations/multiplication.py
@@ -36,7 +36,7 @@ def __init__(self):
         """
         super().__init__("Multiplication")
 
-    def fuse(self, modalities: List[Modality], train_indices=None):
+    def transform(self, modalities: List[Modality], train_indices=None):
         max_emb_size = self.get_max_embedding_size(modalities)
 
         data = pad_sequences(modalities[0].data, maxlen=max_emb_size, dtype="float32")
diff --git a/src/main/python/systemds/scuro/representations/resnet.py b/src/main/python/systemds/scuro/representations/resnet.py
index 75c921184b3..1c1bfa1d5ec 100644
--- a/src/main/python/systemds/scuro/representations/resnet.py
+++ b/src/main/python/systemds/scuro/representations/resnet.py
@@ -25,8 +25,6 @@
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
 from typing import Callable, Dict, Tuple, Any
 import torch.utils.data
-import os
-import cv2
 import torch
 import torchvision.models as models
 import torchvision.transforms as transforms
@@ -36,22 +34,25 @@
 
 
 class ResNet(UnimodalRepresentation):
-    def __init__(self, output_file=None):
+    def __init__(self, layer="avgpool", output_file=None):
         super().__init__("ResNet")
 
         self.output_file = output_file
+        self.layer_name = layer
 
-    def parse_all(self, file_path, indices, get_sequences=False):
-        resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT)
+    def transform(self, data):
+
+        resnet = models.resnet152(weights=models.ResNet152_Weights.DEFAULT).to(DEVICE)
         resnet.eval()
 
         for param in resnet.parameters():
             param.requires_grad = False
 
-        transform = transforms.Compose(
+        t = transforms.Compose(
             [
                 transforms.ToPILImage(),
-                transforms.Resize((224, 224)),
+                transforms.Resize(256),
+                transforms.CenterCrop(224),
                 transforms.ToTensor(),
                 transforms.Normalize(
                     mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
@@ -59,7 +60,7 @@ def parse_all(self, file_path, indices, get_sequences=False):
             ]
         )
 
-        dataset = ResNetDataset(transform=transform, video_folder_path=file_path)
+        dataset = ResNetDataset(data, t)
         embeddings = {}
 
         class Identity(torch.nn.Module):
@@ -70,108 +71,72 @@ def forward(self, input_: torch.Tensor) -> torch.Tensor:
 
         res5c_output = None
 
-        def avg_pool_hook(
-            _module: torch.nn.Module, input_: Tuple[torch.Tensor], _output: Any
-        ) -> None:
-            nonlocal res5c_output
-            res5c_output = input_[0]
+        def get_features(name_):
+            def hook(
+                _module: torch.nn.Module, input_: Tuple[torch.Tensor], output: Any
+            ):
+                nonlocal res5c_output
+                res5c_output = output
+
+            return hook
 
-        resnet.avgpool.register_forward_hook(avg_pool_hook)
+        if self.layer_name:
+            for name, layer in resnet.named_modules():
+                if name == self.layer_name:
+                    layer.register_forward_hook(get_features(name))
+                    break
 
         for instance in torch.utils.data.DataLoader(dataset):
             video_id = instance["id"][0]
             frames = instance["frames"][0].to(DEVICE)
-            embeddings[video_id] = torch.empty((len(frames), 2048))
-            batch_size = 32
+            embeddings[video_id] = []
+            batch_size = 64
+
             for start_index in range(0, len(frames), batch_size):
                 end_index = min(start_index + batch_size, len(frames))
                 frame_ids_range = range(start_index, end_index)
                 frame_batch = frames[frame_ids_range]
 
-                avg_pool_value = resnet(frame_batch)
+                _ = resnet(frame_batch)
+                values = res5c_output
+
+                if self.layer_name == "avgpool" or self.layer_name == "maxpool":
+                    embeddings[video_id].extend(
+                        torch.flatten(values, 1).detach().cpu().numpy()
+                    )
 
-                embeddings[video_id][frame_ids_range] = avg_pool_value.to(DEVICE)
+                else:
+                    pooled = torch.nn.functional.adaptive_avg_pool2d(values, (1, 1))
+
+                    embeddings[video_id].extend(
+                        torch.flatten(pooled, 1).detach().cpu().numpy()
+                    )
 
         if self.output_file is not None:
             with h5py.File(self.output_file, "w") as hdf:
                 for key, value in embeddings.items():
                     hdf.create_dataset(key, data=value)
 
-        emb = np.zeros((len(indices), 2048), dtype="float32")
-        if indices is not None:
-            for i in indices:
-                emb[i] = embeddings.get(str(i)).mean(dim=0).numpy()
-        else:
-            for i, key in enumerate(embeddings.keys()):
-                emb[i] = embeddings.get(key).mean(dim=0).numpy()
-
-        return emb
-
-    @staticmethod
-    def extract_features_from_video(video_path, model, transform):
-        cap = cv2.VideoCapture(video_path)
-        features = []
-        count = 0
-        success, frame = cap.read()
-
-        while success:
-            success, frame = cap.read()
-            transformed_frame = transform(frame).unsqueeze(0)
+        emb = []
 
-            with torch.no_grad():
-                feature_vector = model(transformed_frame)
-                feature_vector = feature_vector.view(-1).numpy()
+        for video in embeddings.values():
+            emb.append(np.array(video).mean(axis=0).tolist())
 
-            features.append(feature_vector)
-
-            count += 1
-
-        cap.release()
-        return features, count
+        return np.array(emb)
 
 
 class ResNetDataset(torch.utils.data.Dataset):
-    def __init__(self, video_folder_path: str, transform: Callable = None):
-        self.video_folder_path = video_folder_path
-        self.transform = transform
-        self.video_ids = []
-        video_files = [
-            f
-            for f in os.listdir(self.video_folder_path)
-            if f.lower().endswith((".mp4", ".avi", ".mov", ".mkv"))
-        ]
-        self.file_extension = video_files[0].split(".")[-1]
-
-        for video in video_files:
-            video_id, _ = video.split("/")[-1].split(".")
-            self.video_ids.append(video_id)
-
-        self.frame_count_by_video_id = {video_id: 0 for video_id in self.video_ids}
+    def __init__(self, data, tf: Callable = None):
+        self.data = data  # one entry per video, each a sequence of frames
+        self.tf = tf  # per-frame transform; output fills a (3, 224, 224) slot
 
     def __getitem__(self, index) -> Dict[str, object]:
-        video_id = self.video_ids[index]
-        video_path = self.video_folder_path + "/" + video_id + "." + self.file_extension
-
-        frames = None
-        count = 0
-
-        cap = cv2.VideoCapture(video_path)
-
-        success, frame = cap.read()
-
-        num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-        self.frame_count_by_video_id[video_id] = num_frames
-        if frames is None and success:
-            frames = torch.empty((num_frames, 3, 224, 224))
-
-        while success:
-            frame = self.transform(frame)
-            frames[count] = frame  # noqa
-            success, frame = cap.read()
-            count += 1
+        video = self.data[index]
+        frames = torch.empty((len(video), 3, 224, 224))
 
-        cap.release()
-        return {"id": video_id, "frames": frames}
+        for i, frame in enumerate(video):
+            frames[i] = self.tf(frame)
+        return {"id": str(index), "frames": frames}  # str: valid HDF5 dataset name
 
     def __len__(self) -> int:
-        return len(self.video_ids)
+        return len(self.data)
diff --git a/src/main/python/systemds/scuro/representations/rowmax.py b/src/main/python/systemds/scuro/representations/rowmax.py
index 0dc201e2ee1..31527820269 100644
--- a/src/main/python/systemds/scuro/representations/rowmax.py
+++ b/src/main/python/systemds/scuro/representations/rowmax.py
@@ -23,10 +23,10 @@
 
 import numpy as np
 
-from modality.modality import Modality
+from systemds.scuro.modality.modality import Modality
 from systemds.scuro.representations.utils import pad_sequences
 
-from representations.fusion import Fusion
+from systemds.scuro.representations.fusion import Fusion
 
 
 class RowMax(Fusion):
@@ -38,7 +38,7 @@ def __init__(self, split=1):
         super().__init__("RowMax")
         self.split = split
 
-    def fuse(self, modalities: List[Modality], train_indices):
+    def transform(self, modalities: List[Modality]):
         if len(modalities) < 2:
             return np.array(modalities)
 
@@ -46,8 +46,7 @@ def fuse(self, modalities: List[Modality], train_indices):
 
         padded_modalities = []
         for modality in modalities:
-            scaled = self.scale_data(modality.data, train_indices)
-            d = pad_sequences(scaled, maxlen=max_emb_size, dtype="float32")
+            d = pad_sequences(modality.data, maxlen=max_emb_size, dtype="float32")
             padded_modalities.append(d)
 
         split_rows = int(len(modalities[0].data) / self.split)
diff --git a/src/main/python/systemds/scuro/representations/sum.py b/src/main/python/systemds/scuro/representations/sum.py
index bfb19d4f7d6..0608338a0fd 100644
--- a/src/main/python/systemds/scuro/representations/sum.py
+++ b/src/main/python/systemds/scuro/representations/sum.py
@@ -35,7 +35,7 @@ def __init__(self):
         """
         super().__init__("Sum")
 
-    def fuse(self, modalities: List[Modality]):
+    def transform(self, modalities: List[Modality]):
         max_emb_size = self.get_max_embedding_size(modalities)
 
         data = pad_sequences(modalities[0].data, maxlen=max_emb_size, dtype="float32")
diff --git a/src/main/python/systemds/scuro/representations/tfidf.py b/src/main/python/systemds/scuro/representations/tfidf.py
index 15515dd5386..4849aba1360 100644
--- a/src/main/python/systemds/scuro/representations/tfidf.py
+++ b/src/main/python/systemds/scuro/representations/tfidf.py
@@ -19,7 +19,6 @@
 #
 # -------------------------------------------------------------
 
-import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
@@ -32,17 +31,13 @@ def __init__(self, min_df, output_file=None):
         self.min_df = min_df
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
+    def transform(self, data):
         vectorizer = TfidfVectorizer(min_df=self.min_df)
 
-        segments = read_data_from_file(filepath, indices)
-        X = vectorizer.fit_transform(segments.values())
+        X = vectorizer.fit_transform(data)
         X = X.toarray()
 
         if self.output_file is not None:
-            df = pd.DataFrame(X)
-            df.index = segments.keys()
-
-            save_embeddings(df, self.output_file)
+            save_embeddings(X, self.output_file)
 
         return X
diff --git a/src/main/python/systemds/scuro/representations/unimodal.py b/src/main/python/systemds/scuro/representations/unimodal.py
index ccd61977654..c56d611a744 100644
--- a/src/main/python/systemds/scuro/representations/unimodal.py
+++ b/src/main/python/systemds/scuro/representations/unimodal.py
@@ -29,7 +29,7 @@ def __init__(self, name):
         """
         super().__init__(name)
 
-    def parse_all(self, file_path, indices):
+    def transform(self, data):
         raise f"Not implemented for {self.name}"
 
 
diff --git a/src/main/python/systemds/scuro/representations/word2vec.py b/src/main/python/systemds/scuro/representations/word2vec.py
index cc8a180889d..209091648d5 100644
--- a/src/main/python/systemds/scuro/representations/word2vec.py
+++ b/src/main/python/systemds/scuro/representations/word2vec.py
@@ -21,10 +21,9 @@
 import numpy as np
 
 from systemds.scuro.representations.unimodal import UnimodalRepresentation
-from systemds.scuro.representations.utils import read_data_from_file, save_embeddings
+from systemds.scuro.representations.utils import save_embeddings
 from gensim.models import Word2Vec
 from nltk.tokenize import word_tokenize
-import nltk
 
 
 def get_embedding(sentence, model):
@@ -44,22 +43,20 @@ def __init__(self, vector_size, min_count, window, output_file=None):
         self.window = window
         self.output_file = output_file
 
-    def parse_all(self, filepath, indices):
-        segments = read_data_from_file(filepath, indices)
-        embeddings = {}
-        t = [word_tokenize(s.lower()) for s in segments.values()]
+    def transform(self, data):
+        """Train Word2Vec on ``data`` and return one embedding per entry."""
+        t = [word_tokenize(s.lower()) for s in data]
         model = Word2Vec(
             sentences=t,
             vector_size=self.vector_size,
             window=self.window,
             min_count=self.min_count,
         )
-
-        for k, v in segments.items():
-            tokenized_words = word_tokenize(v.lower())
-            embeddings[k] = get_embedding(tokenized_words, model)
+        # Reuse the token lists the model was trained on; re-tokenizing ``data``
+        # is redundant work and yields nothing for one-shot iterables.
+        embeddings = np.array([get_embedding(tokens, model) for tokens in t])
 
         if self.output_file is not None:
             save_embeddings(embeddings, self.output_file)
 
-        return np.array(list(embeddings.values()))
+        return embeddings
diff --git a/src/main/python/tests/scuro/data_generator.py b/src/main/python/tests/scuro/data_generator.py
index 9f5b8dd2d73..7f8fc80aeb2 100644
--- a/src/main/python/tests/scuro/data_generator.py
+++ b/src/main/python/tests/scuro/data_generator.py
@@ -23,10 +23,7 @@
 from scipy.io.wavfile import write
 import random
 import os
-
-from systemds.scuro.modality.video_modality import VideoModality
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
+from systemds.scuro.modality.type import ModalityType
 
 
 class TestDataGenerator:
@@ -36,7 +33,7 @@ def __init__(self, modalities, path, balanced=True):
         self.balanced = balanced
 
         for modality in modalities:
-            mod_path = f"{self.path}/{modality.name.lower()}/"
+            mod_path = f"{self.path}/{modality.type.name}/"
             os.mkdir(mod_path)
             modality.file_path = mod_path
         self.labels = []
@@ -72,11 +69,11 @@ def create_multimodal_data(self, num_instances, duration=2, seed=42):
                 speed_slow += 1
 
             for modality in self.modalities:
-                if isinstance(modality, VideoModality):
+                if modality.type == ModalityType.VIDEO:
                     self.__create_video_data(idx, duration, 30, speed_factor)
-                if isinstance(modality, AudioModality):
+                if modality.type == ModalityType.AUDIO:
                     self.__create_audio_data(idx, duration, speed_factor)
-                if isinstance(modality, TextModality):
+                if modality.type == ModalityType.TEXT:
                     self.__create_text_data(idx, speed_factor)
 
         np.save(f"{self.path}/labels.npy", np.array(self.labels))
diff --git a/src/main/python/tests/scuro/test_data_loaders.py b/src/main/python/tests/scuro/test_data_loaders.py
index cbbeafab8a6..5253b6dbc8a 100644
--- a/src/main/python/tests/scuro/test_data_loaders.py
+++ b/src/main/python/tests/scuro/test_data_loaders.py
@@ -22,15 +22,17 @@
 import os
 import shutil
 import unittest
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
 from systemds.scuro.representations.bert import Bert
 from systemds.scuro.representations.mel_spectrogram import MelSpectrogram
 from systemds.scuro.representations.resnet import ResNet
-from systemds.scuro.representations.representation_dataloader import HDF5, NPY, Pickle
 from tests.scuro.data_generator import TestDataGenerator
 
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.modality.type import ModalityType
+
 
 class TestDataLoaders(unittest.TestCase):
     test_file_path = None
@@ -53,28 +55,22 @@ def setUpClass(cls):
 
         cls.num_instances = 2
         cls.indizes = [str(i) for i in range(0, cls.num_instances)]
-        cls.video = VideoModality(
-            "", ResNet(f"{cls.test_file_path}/embeddings/resnet_embeddings.hdf5")
-        )
-        cls.audio = AudioModality(
-            "",
-            MelSpectrogram(
-                output_file=f"{cls.test_file_path}/embeddings/mel_sp_embeddings.npy"
-            ),
-        )
-        cls.text = TextModality(
-            "",
-            Bert(
-                avg_layers=4,
-                output_file=f"{cls.test_file_path}/embeddings/bert_embeddings.pkl",
-            ),
-        )
-        cls.mods = [cls.video, cls.audio, cls.text]
+
+        video_data_loader = VideoLoader(cls.test_file_path + "/video/", cls.indizes)
+        audio_data_loader = AudioLoader(cls.test_file_path + "/audio/", cls.indizes)
+        text_data_loader = TextLoader(cls.test_file_path + "/text/", cls.indizes)
+
+        # Load modalities (audio, video, text)
+        video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
+        audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO)
+        text = UnimodalModality(text_data_loader, ModalityType.TEXT)
+
+        cls.mods = [video, audio, text]
         cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path)
         cls.data_generator.create_multimodal_data(cls.num_instances)
-        cls.text.read_all(cls.indizes)
-        cls.audio.read_all(cls.indizes)
-        cls.video.read_all([i for i in range(0, cls.num_instances)])
+        cls.text_ref = text.apply_representation(Bert())
+        cls.audio_ref = audio.apply_representation(MelSpectrogram())
+        cls.video_ref = video.apply_representation(ResNet())
 
     @classmethod
     def tearDownClass(cls):
@@ -82,35 +78,31 @@ def tearDownClass(cls):
         shutil.rmtree(cls.test_file_path)
 
     def test_load_audio_data_from_file(self):
-        load_audio = AudioModality(
-            f"{self.test_file_path}/embeddings/mel_sp_embeddings.npy", NPY()
-        )
-        load_audio.read_all(self.indizes)
+        """Check freshly loaded audio against the class-level reference sums."""
+        loader = AudioLoader(self.test_file_path + "/audio/", self.indizes)
+        modality = UnimodalModality(loader, ModalityType.AUDIO)
+        audio = modality.apply_representation(MelSpectrogram())
 
         for i in range(0, self.num_instances):
-            assert round(sum(self.audio.data[i]), 4) == round(
-                sum(load_audio.data[i]), 4
-            )
+            assert round(sum(self.audio_ref.data[i]), 4) == round(sum(audio.data[i]), 4)
 
     def test_load_video_data_from_file(self):
-        load_video = VideoModality(
-            f"{self.test_file_path}/embeddings/resnet_embeddings.hdf5", HDF5()
-        )
-        load_video.read_all(self.indizes)
+        """Check freshly loaded video against the class-level reference sums."""
+        loader = VideoLoader(self.test_file_path + "/video/", self.indizes)
+        modality = UnimodalModality(loader, ModalityType.VIDEO)
+        video = modality.apply_representation(ResNet())
 
         for i in range(0, self.num_instances):
-            assert round(sum(self.video.data[i]), 4) == round(
-                sum(load_video.data[i]), 4
-            )
+            assert round(sum(self.video_ref.data[i]), 4) == round(sum(video.data[i]), 4)
 
     def test_load_text_data_from_file(self):
-        load_text = TextModality(
-            f"{self.test_file_path}/embeddings/bert_embeddings.pkl", Pickle()
-        )
-        load_text.read_all(self.indizes)
+        """Check freshly loaded text against the class-level reference sums."""
+        loader = TextLoader(self.test_file_path + "/text/", self.indizes)
+        modality = UnimodalModality(loader, ModalityType.TEXT)
+        text = modality.apply_representation(Bert())
 
         for i in range(0, self.num_instances):
-            assert round(sum(self.text.data[i]), 4) == round(sum(load_text.data[i]), 4)
+            assert round(sum(self.text_ref.data[i]), 4) == round(sum(text.data[i]), 4)
 
 
 if __name__ == "__main__":
diff --git a/src/main/python/tests/scuro/test_dr_search.py b/src/main/python/tests/scuro/test_dr_search.py
index eac4a776413..f2778884339 100644
--- a/src/main/python/tests/scuro/test_dr_search.py
+++ b/src/main/python/tests/scuro/test_dr_search.py
@@ -28,11 +28,13 @@
 from sklearn.model_selection import train_test_split, KFold
 from sklearn.preprocessing import MinMaxScaler
 
+from systemds.scuro.modality.unimodal_modality import UnimodalModality
+from systemds.scuro.modality.type import ModalityType
+from systemds.scuro.dataloader.text_loader import TextLoader
+from systemds.scuro.dataloader.audio_loader import AudioLoader
+from systemds.scuro.dataloader.video_loader import VideoLoader
 from systemds.scuro.aligner.dr_search import DRSearch
 from systemds.scuro.aligner.task import Task
-from systemds.scuro.modality.audio_modality import AudioModality
-from systemds.scuro.modality.text_modality import TextModality
-from systemds.scuro.modality.video_modality import VideoModality
 from systemds.scuro.models.model import Model
 from systemds.scuro.representations.average import Average
 from systemds.scuro.representations.bert import Bert
@@ -101,28 +103,20 @@ def setUpClass(cls):
 
         cls.num_instances = 8
         cls.indizes = [str(i) for i in range(0, cls.num_instances)]
-        cls.video = VideoModality(
-            "", ResNet(f"{cls.test_file_path}/embeddings/resnet_embeddings.hdf5")
-        )
-        cls.audio = AudioModality(
-            "",
-            MelSpectrogram(
-                output_file=f"{cls.test_file_path}/embeddings/mel_sp_embeddings.npy"
-            ),
-        )
-        cls.text = TextModality(
-            "",
-            Bert(
-                avg_layers=4,
-                output_file=f"{cls.test_file_path}/embeddings/bert_embeddings.pkl",
-            ),
-        )
-        cls.mods = [cls.video, cls.audio, cls.text]
-        cls.data_generator = TestDataGenerator(cls.mods, cls.test_file_path)
+        video_data_loader = VideoLoader(cls.test_file_path + "/video/", cls.indizes)
+        audio_data_loader = AudioLoader(cls.test_file_path + "/audio/", cls.indizes)
+        text_data_loader = TextLoader(cls.test_file_path + "/text/", cls.indizes)
+        video = UnimodalModality(video_data_loader, ModalityType.VIDEO)
+        audio = UnimodalModality(audio_data_loader, ModalityType.AUDIO)
+        text = UnimodalModality(text_data_loader, ModalityType.TEXT)
+        cls.data_generator = TestDataGenerator([video, audio, text], cls.test_file_path)
         cls.data_generator.create_multimodal_data(cls.num_instances)
-        cls.text.read_all(cls.indizes)
-        cls.audio.read_all(cls.indizes)
-        cls.video.read_all([i for i in range(0, cls.num_instances)])
+
+        cls.bert = text.apply_representation(Bert())
+        cls.mel_spe = audio.apply_representation(MelSpectrogram())
+        cls.resnet = video.apply_representation(ResNet())
+
+        cls.mods = [cls.bert, cls.mel_spe, cls.resnet]
 
         split = train_test_split(
             cls.indizes, cls.data_generator.labels, test_size=0.2, random_state=42