diff --git a/cvat/apps/dataset_manager/formats/LibriVox.py b/cvat/apps/dataset_manager/formats/LibriVox.py
index 55680859bab8..a8a5924c58ed 100644
--- a/cvat/apps/dataset_manager/formats/LibriVox.py
+++ b/cvat/apps/dataset_manager/formats/LibriVox.py
@@ -1,12 +1,62 @@
+import os
 import os.path as osp
 import zipfile
+import csv
+from django.db import transaction
 from glob import glob
+from pydub import AudioSegment
 from cvat.apps.dataset_manager.bindings import InstanceLabelData
 from cvat.apps.engine.serializers import LabeledDataSerializer
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.task import PatchAction
 from .registry import importer
-from cvat.apps.engine.models import Task, Job
+from cvat.apps.engine.models import Job, Task, Data
+from cvat.apps.engine.task import _create_thread
+from cvat.apps.dataset_manager.bindings import ProjectData
+
+
+def calculate_duration(row):
+    """Return float(row["end"]) - float(row["start"]); both assumed to be seconds."""
+    begin, finish = float(row["start"]), float(row["end"])
+    return finish - begin
+
+
+def split_rows_by_time(all_rows, time_threshold=600):
+    """Cut rows so that no piece crosses a ``time_threshold``-second window.
+
+    Durations ("end" - "start", in seconds) accumulate in row order. A row that
+    overflows the current window is cut at the boundary as many times as needed,
+    so a row longer than a whole window cannot push later cuts to a negative
+    length. Cut pieces are copies with "start"/"end" rewritten as strings; rows
+    that fit are appended unchanged, and no empty piece is emitted when the
+    previous row filled its window exactly.
+
+    Raises ValueError if ``time_threshold`` is not positive.
+    """
+    if time_threshold <= 0:
+        raise ValueError("time_threshold must be positive")
+
+    result = []
+    elapsed = 0.0  # seconds already used in the current window
+
+    for row in all_rows:
+        start = float(row["start"])
+        remaining = float(row["end"]) - start
+        piece = row
+        while elapsed + remaining > time_threshold:
+            room = time_threshold - elapsed
+            elapsed = 0.0  # every pass closes the current window
+            if room <= 0:
+                continue  # window was already full: nothing to cut off
+            start += room
+            remaining -= room
+            head, piece = piece.copy(), piece.copy()
+            head["end"] = piece["start"] = str(start)
+            result.append(head)
+        result.append(piece)
+        elapsed += remaining
+
+    return result
 
 
 def load_anno(file_object, annotations):
@@ -39,22 +89,6 @@ def load_anno(file_object, annotations):
         label_name = record.get("label")
         label_id = label_data._get_label_id(label_name)
 
-        language_id_to_locale_mapping = {
-            0: "en-US",
-            1: "es-ES",
-            2: "fr-FR",
-            3: "zh-CN",
-            4: "hi-IN",
-            5: "ar-EG",
-            6: "pt-BR",
-            7: "ja-JP",
-            8: "de-DE",
-            9: "ru-RU",
-        }
-
-        # defaults to -1 if language field not in tsv, locale will be an empty string
-        language_id = int(record.get("language", -1))
-
         attributes = []
 
         for i in range(1, len(headers)):
@@ -76,6 +110,24 @@ def load_anno(file_object, annotations):
                         }
                     )
 
+        language_id_to_locale_mapping = {
+            0: "en-US",
+            1: "es-ES",
+            2: "fr-FR",
+            3: "zh-CN",
+            4: "hi-IN",
+            5: "ar-EG",
+            6: "pt-BR",
+            7: "ja-JP",
+            8: "de-DE",
+            9: "ru-RU",
+        }
+
+        # defaults to -1 if language field not in tsv, locale will be an empty string
+        language_id = (
+            int(float(record.get("language", -1))) if record.get("language") else -1
+        )
+
         shapes_data = [
             {
                 "type": "rectangle",
@@ -112,9 +164,209 @@ def load_anno(file_object, annotations):
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
     is_zip = zipfile.is_zipfile(src_file)
     src_file.seek(0)
+    file_name = os.path.basename(src_file.name)
+    name_without_extension = os.path.splitext(file_name)[0]  # used as the new task's name
+    # Nothing below runs unless src_file is a zip archive (there is no non-zip branch).
     if is_zip:
         zipfile.ZipFile(src_file).extractall(temp_dir)
 
-        anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
-        for p in anno_paths:
-            load_anno(p, instance_data)
+        if isinstance(instance_data, ProjectData):  # project-level import
+            project = instance_data.db_project
+            new_task = Task.objects.create(
+                project=project,
+                name=name_without_extension,
+                segment_size=0,
+            )
+            new_task.save()
+            # Re-fetch the task under a row lock and give it a Data record if it has none.
+            with transaction.atomic():
+                locked_instance = Task.objects.select_for_update().get(pk=new_task.id)
+                task_data = locked_instance.data
+                if not task_data:
+                    task_data = Data.objects.create()
+                    task_data.make_dirs()
+                    locked_instance.data = task_data
+                    locked_instance.save()
+            # Expected layout at the archive root: "data.tsv" plus a "clips" directory.
+            clips_folder = os.path.join(temp_dir, "clips")
+            tsv_file_path = os.path.join(temp_dir, "data.tsv")
+            # Guard: the TSV must have exactly one row per entry in the clips folder.
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                tsv_rows = list(reader)
+
+                num_tsv_rows = len(tsv_rows)
+                num_clips = len(os.listdir(clips_folder))
+
+                if num_tsv_rows != num_clips:
+                    raise ValueError(
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                    )
+
+            # Accumulator for the concatenated clips; exported below as combined_audio.wav
+            combined_audio = AudioSegment.empty()
+
+            # Re-read the TSV to append clips in row order; rows whose file is missing are skipped
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+
+                for row in reader:
+                    audio_file_name = row[
+                        "file"
+                    ]  # Assuming 'file' column contains audio file names
+                    file_path = os.path.join(clips_folder, audio_file_name)
+
+                    if os.path.isfile(file_path):
+                        audio_segment = AudioSegment.from_file(file_path)
+                        combined_audio += (
+                            audio_segment  # Append the audio in the order from TSV
+                        )
+
+            # Write the combined audio into the "raw" folder of the task's data directory
+            raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
+            os.makedirs(raw_folder_path, exist_ok=True)
+
+            combined_audio_path = os.path.join(raw_folder_path, "combined_audio.wav")
+            combined_audio.export(combined_audio_path, format="wav")
+            # Settings passed to _create_thread; the exported wav is the only client file.
+            data = {
+                "chunk_size": None,
+                "image_quality": 70,
+                "start_frame": 0,
+                "stop_frame": None,
+                "frame_filter": "",
+                "client_files": ["combined_audio.wav"],
+                "server_files": [],
+                "remote_files": [],
+                "use_zip_chunks": False,
+                "server_files_exclude": [],
+                "use_cache": False,
+                "copy_data": False,
+                "storage_method": "file_system",
+                "storage": "local",
+                "sorting_method": "lexicographical",
+                "filename_pattern": None,
+            }
+            # NOTE(review): presumably this is what creates the jobs queried below - confirm.
+            _create_thread(
+                locked_instance, data, is_task_import=True, temp_dir=temp_dir
+            )
+            # Third TSV read: rows are cut by split_rows_by_time using its default threshold.
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                all_rows = list(reader)
+
+                new_rows = split_rows_by_time(all_rows)
+
+            jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")
+
+            label_data = InstanceLabelData(instance_data.db_project)
+            # Rows are consumed once, in order, across jobs; each job's timeline restarts at 0.
+            record_index = 0
+            for job in jobs:
+                start_time = 0
+
+                while record_index < len(new_rows):
+                    record = new_rows[record_index]
+
+                    record_duration = calculate_duration(record)
+
+                    end_time = start_time + record_duration
+
+                    label_name = record.get("label")
+                    label_id = label_data._get_label_id(label_name)
+
+                    attributes = []
+
+                    # Process dynamic attribute_i_name and attribute_i_value fields
+                    attribute_index = 1  # Start with the first attribute
+                    while True:
+                        attribute_name_key = f"attribute_{attribute_index}_name"
+                        attribute_value_key = f"attribute_{attribute_index}_value"
+
+                        # Check if the keys exist in the record
+                        if (
+                            attribute_name_key in record
+                            and attribute_value_key in record
+                        ):
+                            attribute_name = record.get(attribute_name_key)
+                            attribute_value = record.get(attribute_value_key)
+
+                            if attribute_name and attribute_value:
+                                spec_id = label_data._get_attribute_id(
+                                    label_id, attribute_name
+                                )
+                                attributes.append(
+                                    {
+                                        "spec_id": spec_id,
+                                        "value": attribute_value,
+                                    }
+                                )
+
+                            attribute_index += 1  # Move to the next attribute index
+                        else:
+                            break  # Exit the loop when no more attributes are found
+
+                    language_id_to_locale_mapping = {
+                        0: "en-US",
+                        1: "es-ES",
+                        2: "fr-FR",
+                        3: "zh-CN",
+                        4: "hi-IN",
+                        5: "ar-EG",
+                        6: "pt-BR",
+                        7: "ja-JP",
+                        8: "de-DE",
+                        9: "ru-RU",
+                    }
+
+                    # -1 when the language field is absent or empty, which maps to an empty locale
+                    language_id = (
+                        int(float(record.get("language", -1)))
+                        if record.get("language")
+                        else -1
+                    )
+                    # "points" carries the job-relative start and end times, each one twice.
+                    shapes_data = [
+                        {
+                            "type": "rectangle",
+                            "label": record.get("label", ""),
+                            "points": [start_time, start_time, end_time, end_time],
+                            "frame": 0,
+                            "occluded": False,
+                            "z_order": 0,
+                            "group": None,
+                            "source": "manual",
+                            "transcript": record.get("text", ""),
+                            "gender": record.get("gender", ""),
+                            "age": record.get("age", ""),
+                            "locale": language_id_to_locale_mapping.get(
+                                language_id, ""
+                            ),
+                            "accent": record.get("accent", ""),
+                            "emotion": record.get("emotion", ""),
+                            "rotation": 0.0,
+                            "label_id": label_id,
+                            "attributes": attributes,
+                        }
+                    ]
+
+                    data = {"shapes": shapes_data}
+                    start_time = end_time
+                    # Validate the single-shape payload and hand it to patch_job_data (CREATE).
+                    serializer = LabeledDataSerializer(data=data)
+                    pk = int(job.id)
+                    action = PatchAction.CREATE
+
+                    if serializer.is_valid(raise_exception=True):
+                        data = dm.task.patch_job_data(pk, serializer.data, action)
+                    # Switch to the next job once the running end time rounds into [599.9, 600].
+                    record_index += 1
+                    total_duration = round(end_time, 2)
+                    if 599.9 <= total_duration <= 600:
+                        break
+        # Any other instance type: pass every *.tsv in the archive to load_anno.
+        else:
+            anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
+            for p in anno_paths:
+                load_anno(p, instance_data)
diff --git a/cvat/apps/dataset_manager/formats/VCTK_Corpus.py b/cvat/apps/dataset_manager/formats/VCTK_Corpus.py
index 5057e16e9d6f..87e68cdec291 100644
--- a/cvat/apps/dataset_manager/formats/VCTK_Corpus.py
+++ b/cvat/apps/dataset_manager/formats/VCTK_Corpus.py
@@ -1,12 +1,62 @@
+import os
 import os.path as osp
 import zipfile
+import csv
+from django.db import transaction
 from glob import glob
+from pydub import AudioSegment
 from cvat.apps.dataset_manager.bindings import InstanceLabelData
 from cvat.apps.engine.serializers import LabeledDataSerializer
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.task import PatchAction
 from .registry import importer
-from cvat.apps.engine.models import Task, Job
+from cvat.apps.engine.models import Job, Task, Data
+from cvat.apps.engine.task import _create_thread
+from cvat.apps.dataset_manager.bindings import ProjectData
+
+
+def calculate_duration(row):
+    """Return float(row["end"]) - float(row["start"]); both assumed to be seconds."""
+    begin, finish = float(row["start"]), float(row["end"])
+    return finish - begin
+
+
+def split_rows_by_time(all_rows, time_threshold=600):
+    """Cut rows so that no piece crosses a ``time_threshold``-second window.
+
+    Durations ("end" - "start", in seconds) accumulate in row order. A row that
+    overflows the current window is cut at the boundary as many times as needed,
+    so a row longer than a whole window cannot push later cuts to a negative
+    length. Cut pieces are copies with "start"/"end" rewritten as strings; rows
+    that fit are appended unchanged, and no empty piece is emitted when the
+    previous row filled its window exactly.
+
+    Raises ValueError if ``time_threshold`` is not positive.
+    """
+    if time_threshold <= 0:
+        raise ValueError("time_threshold must be positive")
+
+    result = []
+    elapsed = 0.0  # seconds already used in the current window
+
+    for row in all_rows:
+        start = float(row["start"])
+        remaining = float(row["end"]) - start
+        piece = row
+        while elapsed + remaining > time_threshold:
+            room = time_threshold - elapsed
+            elapsed = 0.0  # every pass closes the current window
+            if room <= 0:
+                continue  # window was already full: nothing to cut off
+            start += room
+            remaining -= room
+            head, piece = piece.copy(), piece.copy()
+            head["end"] = piece["start"] = str(start)
+            result.append(head)
+        result.append(piece)
+        elapsed += remaining
+
+    return result
 
 
 def load_anno(file_object, annotations):
@@ -39,22 +89,6 @@ def load_anno(file_object, annotations):
         label_name = record.get("label")
         label_id = label_data._get_label_id(label_name)
 
-        language_id_to_locale_mapping = {
-            0: "en-US",
-            1: "es-ES",
-            2: "fr-FR",
-            3: "zh-CN",
-            4: "hi-IN",
-            5: "ar-EG",
-            6: "pt-BR",
-            7: "ja-JP",
-            8: "de-DE",
-            9: "ru-RU",
-        }
-
-        # defaults to -1 if language field not in tsv, locale will be an empty string
-        language_id = int(record.get("language", -1))
-
         attributes = []
 
         for i in range(1, len(headers)):
@@ -76,6 +110,24 @@ def load_anno(file_object, annotations):
                         }
                     )
 
+        language_id_to_locale_mapping = {
+            0: "en-US",
+            1: "es-ES",
+            2: "fr-FR",
+            3: "zh-CN",
+            4: "hi-IN",
+            5: "ar-EG",
+            6: "pt-BR",
+            7: "ja-JP",
+            8: "de-DE",
+            9: "ru-RU",
+        }
+
+        # defaults to -1 if language field not in tsv, locale will be an empty string
+        language_id = (
+            int(float(record.get("language", -1))) if record.get("language") else -1
+        )
+
         shapes_data = [
             {
                 "type": "rectangle",
@@ -112,9 +164,209 @@ def load_anno(file_object, annotations):
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
     is_zip = zipfile.is_zipfile(src_file)
     src_file.seek(0)
+    file_name = os.path.basename(src_file.name)
+    name_without_extension = os.path.splitext(file_name)[0]  # used as the new task's name
+    # Nothing below runs unless src_file is a zip archive (there is no non-zip branch).
     if is_zip:
         zipfile.ZipFile(src_file).extractall(temp_dir)
 
-        anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
-        for p in anno_paths:
-            load_anno(p, instance_data)
+        if isinstance(instance_data, ProjectData):  # project-level import
+            project = instance_data.db_project
+            new_task = Task.objects.create(
+                project=project,
+                name=name_without_extension,
+                segment_size=0,
+            )
+            new_task.save()
+            # Re-fetch the task under a row lock and give it a Data record if it has none.
+            with transaction.atomic():
+                locked_instance = Task.objects.select_for_update().get(pk=new_task.id)
+                task_data = locked_instance.data
+                if not task_data:
+                    task_data = Data.objects.create()
+                    task_data.make_dirs()
+                    locked_instance.data = task_data
+                    locked_instance.save()
+            # Expected layout at the archive root: "data.tsv" plus a "clips" directory.
+            clips_folder = os.path.join(temp_dir, "clips")
+            tsv_file_path = os.path.join(temp_dir, "data.tsv")
+            # Guard: the TSV must have exactly one row per entry in the clips folder.
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                tsv_rows = list(reader)
+
+                num_tsv_rows = len(tsv_rows)
+                num_clips = len(os.listdir(clips_folder))
+
+                if num_tsv_rows != num_clips:
+                    raise ValueError(
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                    )
+
+            # Accumulator for the concatenated clips; exported below as combined_audio.wav
+            combined_audio = AudioSegment.empty()
+
+            # Re-read the TSV to append clips in row order; rows whose file is missing are skipped
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+
+                for row in reader:
+                    audio_file_name = row[
+                        "file"
+                    ]  # Assuming 'file' column contains audio file names
+                    file_path = os.path.join(clips_folder, audio_file_name)
+
+                    if os.path.isfile(file_path):
+                        audio_segment = AudioSegment.from_file(file_path)
+                        combined_audio += (
+                            audio_segment  # Append the audio in the order from TSV
+                        )
+
+            # Write the combined audio into the "raw" folder of the task's data directory
+            raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
+            os.makedirs(raw_folder_path, exist_ok=True)
+
+            combined_audio_path = os.path.join(raw_folder_path, "combined_audio.wav")
+            combined_audio.export(combined_audio_path, format="wav")
+            # Settings passed to _create_thread; the exported wav is the only client file.
+            data = {
+                "chunk_size": None,
+                "image_quality": 70,
+                "start_frame": 0,
+                "stop_frame": None,
+                "frame_filter": "",
+                "client_files": ["combined_audio.wav"],
+                "server_files": [],
+                "remote_files": [],
+                "use_zip_chunks": False,
+                "server_files_exclude": [],
+                "use_cache": False,
+                "copy_data": False,
+                "storage_method": "file_system",
+                "storage": "local",
+                "sorting_method": "lexicographical",
+                "filename_pattern": None,
+            }
+            # NOTE(review): presumably this is what creates the jobs queried below - confirm.
+            _create_thread(
+                locked_instance, data, is_task_import=True, temp_dir=temp_dir
+            )
+            # Third TSV read: rows are cut by split_rows_by_time using its default threshold.
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                all_rows = list(reader)
+
+                new_rows = split_rows_by_time(all_rows)
+
+            jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")
+
+            label_data = InstanceLabelData(instance_data.db_project)
+            # Rows are consumed once, in order, across jobs; each job's timeline restarts at 0.
+            record_index = 0
+            for job in jobs:
+                start_time = 0
+
+                while record_index < len(new_rows):
+                    record = new_rows[record_index]
+
+                    record_duration = calculate_duration(record)
+
+                    end_time = start_time + record_duration
+
+                    label_name = record.get("label")
+                    label_id = label_data._get_label_id(label_name)
+
+                    attributes = []
+
+                    # Process dynamic attribute_i_name and attribute_i_value fields
+                    attribute_index = 1  # Start with the first attribute
+                    while True:
+                        attribute_name_key = f"attribute_{attribute_index}_name"
+                        attribute_value_key = f"attribute_{attribute_index}_value"
+
+                        # Check if the keys exist in the record
+                        if (
+                            attribute_name_key in record
+                            and attribute_value_key in record
+                        ):
+                            attribute_name = record.get(attribute_name_key)
+                            attribute_value = record.get(attribute_value_key)
+
+                            if attribute_name and attribute_value:
+                                spec_id = label_data._get_attribute_id(
+                                    label_id, attribute_name
+                                )
+                                attributes.append(
+                                    {
+                                        "spec_id": spec_id,
+                                        "value": attribute_value,
+                                    }
+                                )
+
+                            attribute_index += 1  # Move to the next attribute index
+                        else:
+                            break  # Exit the loop when no more attributes are found
+
+                    language_id_to_locale_mapping = {
+                        0: "en-US",
+                        1: "es-ES",
+                        2: "fr-FR",
+                        3: "zh-CN",
+                        4: "hi-IN",
+                        5: "ar-EG",
+                        6: "pt-BR",
+                        7: "ja-JP",
+                        8: "de-DE",
+                        9: "ru-RU",
+                    }
+
+                    # -1 when the language field is absent or empty, which maps to an empty locale
+                    language_id = (
+                        int(float(record.get("language", -1)))
+                        if record.get("language")
+                        else -1
+                    )
+                    # "points" carries the job-relative start and end times, each one twice.
+                    shapes_data = [
+                        {
+                            "type": "rectangle",
+                            "label": record.get("label", ""),
+                            "points": [start_time, start_time, end_time, end_time],
+                            "frame": 0,
+                            "occluded": False,
+                            "z_order": 0,
+                            "group": None,
+                            "source": "manual",
+                            "transcript": record.get("text", ""),
+                            "gender": record.get("gender", ""),
+                            "age": record.get("age", ""),
+                            "locale": language_id_to_locale_mapping.get(
+                                language_id, ""
+                            ),
+                            "accent": record.get("accent", ""),
+                            "emotion": record.get("emotion", ""),
+                            "rotation": 0.0,
+                            "label_id": label_id,
+                            "attributes": attributes,
+                        }
+                    ]
+
+                    data = {"shapes": shapes_data}
+                    start_time = end_time
+                    # Validate the single-shape payload and hand it to patch_job_data (CREATE).
+                    serializer = LabeledDataSerializer(data=data)
+                    pk = int(job.id)
+                    action = PatchAction.CREATE
+
+                    if serializer.is_valid(raise_exception=True):
+                        data = dm.task.patch_job_data(pk, serializer.data, action)
+                    # Switch to the next job once the running end time rounds into [599.9, 600].
+                    record_index += 1
+                    total_duration = round(end_time, 2)
+                    if 599.9 <= total_duration <= 600:
+                        break
+        # Any other instance type: pass every *.tsv in the archive to load_anno.
+        else:
+            anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
+            for p in anno_paths:
+                load_anno(p, instance_data)
diff --git a/cvat/apps/dataset_manager/formats/VoxCeleb.py b/cvat/apps/dataset_manager/formats/VoxCeleb.py
index 139b7473a30d..ab527a5aa3d5 100644
--- a/cvat/apps/dataset_manager/formats/VoxCeleb.py
+++ b/cvat/apps/dataset_manager/formats/VoxCeleb.py
@@ -1,12 +1,62 @@
+import os
 import os.path as osp
 import zipfile
+import csv
+from django.db import transaction
 from glob import glob
+from pydub import AudioSegment
 from cvat.apps.dataset_manager.bindings import InstanceLabelData
 from cvat.apps.engine.serializers import LabeledDataSerializer
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.task import PatchAction
 from .registry import importer
-from cvat.apps.engine.models import Task, Job
+from cvat.apps.engine.models import Job, Task, Data
+from cvat.apps.engine.task import _create_thread
+from cvat.apps.dataset_manager.bindings import ProjectData
+
+
+def calculate_duration(row):
+    """Return float(row["end"]) - float(row["start"]); both assumed to be seconds."""
+    begin, finish = float(row["start"]), float(row["end"])
+    return finish - begin
+
+
+def split_rows_by_time(all_rows, time_threshold=600):
+    """Cut rows so that no piece crosses a ``time_threshold``-second window.
+
+    Durations ("end" - "start", in seconds) accumulate in row order. A row that
+    overflows the current window is cut at the boundary as many times as needed,
+    so a row longer than a whole window cannot push later cuts to a negative
+    length. Cut pieces are copies with "start"/"end" rewritten as strings; rows
+    that fit are appended unchanged, and no empty piece is emitted when the
+    previous row filled its window exactly.
+
+    Raises ValueError if ``time_threshold`` is not positive.
+    """
+    if time_threshold <= 0:
+        raise ValueError("time_threshold must be positive")
+
+    result = []
+    elapsed = 0.0  # seconds already used in the current window
+
+    for row in all_rows:
+        start = float(row["start"])
+        remaining = float(row["end"]) - start
+        piece = row
+        while elapsed + remaining > time_threshold:
+            room = time_threshold - elapsed
+            elapsed = 0.0  # every pass closes the current window
+            if room <= 0:
+                continue  # window was already full: nothing to cut off
+            start += room
+            remaining -= room
+            head, piece = piece.copy(), piece.copy()
+            head["end"] = piece["start"] = str(start)
+            result.append(head)
+        result.append(piece)
+        elapsed += remaining
+
+    return result
 
 
 def load_anno(file_object, annotations):
@@ -39,22 +89,6 @@ def load_anno(file_object, annotations):
         label_name = record.get("label")
         label_id = label_data._get_label_id(label_name)
 
-        language_id_to_locale_mapping = {
-            0: "en-US",
-            1: "es-ES",
-            2: "fr-FR",
-            3: "zh-CN",
-            4: "hi-IN",
-            5: "ar-EG",
-            6: "pt-BR",
-            7: "ja-JP",
-            8: "de-DE",
-            9: "ru-RU",
-        }
-
-        # defaults to -1 if language field not in tsv, locale will be an empty string
-        language_id = int(record.get("language", -1))
-
         attributes = []
 
         for i in range(1, len(headers)):
@@ -76,6 +110,24 @@ def load_anno(file_object, annotations):
                         }
                     )
 
+        language_id_to_locale_mapping = {
+            0: "en-US",
+            1: "es-ES",
+            2: "fr-FR",
+            3: "zh-CN",
+            4: "hi-IN",
+            5: "ar-EG",
+            6: "pt-BR",
+            7: "ja-JP",
+            8: "de-DE",
+            9: "ru-RU",
+        }
+
+        # defaults to -1 if language field not in tsv, locale will be an empty string
+        language_id = (
+            int(float(record.get("language", -1))) if record.get("language") else -1
+        )
+
         shapes_data = [
             {
                 "type": "rectangle",
@@ -112,9 +164,209 @@ def load_anno(file_object, annotations):
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
     is_zip = zipfile.is_zipfile(src_file)
     src_file.seek(0)
+    file_name = os.path.basename(src_file.name)
+    name_without_extension = os.path.splitext(file_name)[0]  # used below as the new task's name
+
     if is_zip:
         zipfile.ZipFile(src_file).extractall(temp_dir)
 
-        anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
-        for p in anno_paths:
-            load_anno(p, instance_data)
+        if isinstance(instance_data, ProjectData):  # project import: build a new task from the archive
+            project = instance_data.db_project
+            new_task = Task.objects.create(
+                project=project,
+                name=name_without_extension,
+                segment_size=0,
+            )
+            new_task.save()
+
+            with transaction.atomic():  # re-read the task under a row lock and give it a Data record if it has none
+                locked_instance = Task.objects.select_for_update().get(pk=new_task.id)
+                task_data = locked_instance.data
+                if not task_data:
+                    task_data = Data.objects.create()
+                    task_data.make_dirs()
+                    locked_instance.data = task_data
+                    locked_instance.save()
+
+            clips_folder = os.path.join(temp_dir, "clips")
+            tsv_file_path = os.path.join(temp_dir, "data.tsv")  # "clips/" and "data.tsv" are looked up directly under temp_dir
+
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                tsv_rows = list(reader)
+
+                num_tsv_rows = len(tsv_rows)
+                num_clips = len(os.listdir(clips_folder))  # counts every directory entry, whatever its type
+
+                if num_tsv_rows != num_clips:
+                    raise ValueError(
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                    )
+
+            # Accumulator for the audio of all clips, concatenated in TSV row order
+            combined_audio = AudioSegment.empty()
+
+            # Second pass over the TSV: rows give the order in which clips are appended
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+
+                for row in reader:
+                    audio_file_name = row[
+                        "file"
+                    ]  # 'file' column is joined onto clips_folder to locate the clip
+                    file_path = os.path.join(clips_folder, audio_file_name)
+
+                    if os.path.isfile(file_path):  # NOTE(review): a row whose clip is missing is skipped silently
+                        audio_segment = AudioSegment.from_file(file_path)
+                        combined_audio += (
+                            audio_segment  # Append the audio in the order from TSV
+                        )
+
+            # Write the concatenated audio as "raw/combined_audio.wav" under the task's data directory
+            raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
+            os.makedirs(raw_folder_path, exist_ok=True)
+
+            combined_audio_path = os.path.join(raw_folder_path, "combined_audio.wav")
+            combined_audio.export(combined_audio_path, format="wav")
+
+            data = {
+                "chunk_size": None,
+                "image_quality": 70,
+                "start_frame": 0,
+                "stop_frame": None,
+                "frame_filter": "",
+                "client_files": ["combined_audio.wav"],
+                "server_files": [],
+                "remote_files": [],
+                "use_zip_chunks": False,
+                "server_files_exclude": [],
+                "use_cache": False,
+                "copy_data": False,
+                "storage_method": "file_system",
+                "storage": "local",
+                "sorting_method": "lexicographical",
+                "filename_pattern": None,
+            }
+
+            _create_thread(
+                locked_instance, data, is_task_import=True, temp_dir=temp_dir
+            )
+
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                all_rows = list(reader)
+
+                new_rows = split_rows_by_time(all_rows)  # uses the helper's default 600 threshold
+
+            jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")  # jobs of the new task, lowest id first
+
+            label_data = InstanceLabelData(instance_data.db_project)
+
+            record_index = 0  # shared across jobs: each job resumes where the previous one stopped
+            for job in jobs:
+                start_time = 0  # shape times restart at 0 for every job
+
+                while record_index < len(new_rows):
+                    record = new_rows[record_index]
+
+                    record_duration = calculate_duration(record)
+
+                    end_time = start_time + record_duration
+
+                    label_name = record.get("label")
+                    label_id = label_data._get_label_id(label_name)
+
+                    attributes = []
+
+                    # Process dynamic attribute_i_name and attribute_i_value fields
+                    attribute_index = 1  # Start with the first attribute
+                    while True:
+                        attribute_name_key = f"attribute_{attribute_index}_name"
+                        attribute_value_key = f"attribute_{attribute_index}_value"
+
+                        # Check if the keys exist in the record
+                        if (
+                            attribute_name_key in record
+                            and attribute_value_key in record
+                        ):
+                            attribute_name = record.get(attribute_name_key)
+                            attribute_value = record.get(attribute_value_key)
+
+                            if attribute_name and attribute_value:
+                                spec_id = label_data._get_attribute_id(
+                                    label_id, attribute_name
+                                )
+                                attributes.append(
+                                    {
+                                        "spec_id": spec_id,
+                                        "value": attribute_value,
+                                    }
+                                )
+
+                            attribute_index += 1  # Move to the next attribute index
+                        else:
+                            break  # Exit the loop when no more attributes are found
+
+                    language_id_to_locale_mapping = {
+                        0: "en-US",
+                        1: "es-ES",
+                        2: "fr-FR",
+                        3: "zh-CN",
+                        4: "hi-IN",
+                        5: "ar-EG",
+                        6: "pt-BR",
+                        7: "ja-JP",
+                        8: "de-DE",
+                        9: "ru-RU",
+                    }
+
+                    # -1 when the language field is missing or empty; the locale lookup then yields ""
+                    language_id = (
+                        int(float(record.get("language", -1)))
+                        if record.get("language")
+                        else -1
+                    )
+
+                    shapes_data = [
+                        {
+                            "type": "rectangle",
+                            "label": record.get("label", ""),
+                            "points": [start_time, start_time, end_time, end_time],
+                            "frame": 0,
+                            "occluded": False,
+                            "z_order": 0,
+                            "group": None,
+                            "source": "manual",
+                            "transcript": record.get("text", ""),
+                            "gender": record.get("gender", ""),
+                            "age": record.get("age", ""),
+                            "locale": language_id_to_locale_mapping.get(
+                                language_id, ""
+                            ),
+                            "accent": record.get("accent", ""),
+                            "emotion": record.get("emotion", ""),
+                            "rotation": 0.0,
+                            "label_id": label_id,
+                            "attributes": attributes,
+                        }
+                    ]
+
+                    data = {"shapes": shapes_data}
+                    start_time = end_time
+
+                    serializer = LabeledDataSerializer(data=data)
+                    pk = int(job.id)
+                    action = PatchAction.CREATE
+
+                    if serializer.is_valid(raise_exception=True):
+                        data = dm.task.patch_job_data(pk, serializer.data, action)
+
+                    record_index += 1
+                    total_duration = round(end_time, 2)
+                    if 599.9 <= total_duration <= 600:  # about 600 accumulated: continue with the next job
+                        break
+
+        else:
+            anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
+            for p in anno_paths:
+                load_anno(p, instance_data)
diff --git a/cvat/apps/dataset_manager/formats/Voxpopuli.py b/cvat/apps/dataset_manager/formats/Voxpopuli.py
index 1f6905f6e9e0..17c179ceb75b 100644
--- a/cvat/apps/dataset_manager/formats/Voxpopuli.py
+++ b/cvat/apps/dataset_manager/formats/Voxpopuli.py
@@ -1,12 +1,62 @@
+import os
 import os.path as osp
 import zipfile
+import csv
+from django.db import transaction
 from glob import glob
+from pydub import AudioSegment
 from cvat.apps.dataset_manager.bindings import InstanceLabelData
 from cvat.apps.engine.serializers import LabeledDataSerializer
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.task import PatchAction
 from .registry import importer
-from cvat.apps.engine.models import Task, Job
+from cvat.apps.engine.models import Job, Task, Data
+from cvat.apps.engine.task import _create_thread
+from cvat.apps.dataset_manager.bindings import ProjectData
+
+
+def calculate_duration(row):
+    """Return float(row["end"]) - float(row["start"]); times are assumed to be seconds."""
+    begin, finish = float(row["start"]), float(row["end"])
+    return finish - begin
+
+
+def split_rows_by_time(all_rows, time_threshold=600):
+    """Cut rows wherever the running duration reaches ``time_threshold``.
+
+    Each row is a dict whose "start"/"end" values parse as floats. A row that
+    crosses a boundary is replaced by copies with adjusted "start"/"end"
+    strings; the remainder is cut again while it is still too long, so no
+    piece straddles a boundary. Raises ValueError if a cut makes no progress.
+    """
+    if time_threshold <= 0:
+        raise ValueError("time_threshold must be positive")
+
+    result = []
+    total_duration = 0
+
+    for row in all_rows:
+        piece = row
+        duration = float(piece["end"]) - float(piece["start"])
+        while total_duration + duration > time_threshold:
+            remaining_time = time_threshold - total_duration
+            first_part = piece.copy()
+            first_part["end"] = str(float(first_part["start"]) + remaining_time)
+            second_part = piece.copy()
+            second_part["start"] = first_part["end"]
+            rest = float(second_part["end"]) - float(second_part["start"])
+            if total_duration == 0 and not rest < duration:
+                # a full-window cut that shrinks nothing (inf, precision) would loop
+                raise ValueError(f"cannot split row into {time_threshold}s pieces: {row}")
+            result.append(first_part)
+            piece, duration, total_duration = second_part, rest, 0
+
+        result.append(piece)
+        total_duration += duration
+        if total_duration >= time_threshold:
+            total_duration = 0  # window exactly full: the next row opens a new one
+
+    return result
 
 
 def load_anno(file_object, annotations):
@@ -39,22 +89,6 @@ def load_anno(file_object, annotations):
         label_name = record.get("label")
         label_id = label_data._get_label_id(label_name)
 
-        language_id_to_locale_mapping = {
-            0: "en-US",
-            1: "es-ES",
-            2: "fr-FR",
-            3: "zh-CN",
-            4: "hi-IN",
-            5: "ar-EG",
-            6: "pt-BR",
-            7: "ja-JP",
-            8: "de-DE",
-            9: "ru-RU",
-        }
-
-        # defaults to -1 if language field not in tsv, locale will be an empty string
-        language_id = int(record.get("language", -1))
-
         attributes = []
 
         for i in range(1, len(headers)):
@@ -76,6 +110,24 @@ def load_anno(file_object, annotations):
                         }
                     )
 
+        language_id_to_locale_mapping = {
+            0: "en-US",
+            1: "es-ES",
+            2: "fr-FR",
+            3: "zh-CN",
+            4: "hi-IN",
+            5: "ar-EG",
+            6: "pt-BR",
+            7: "ja-JP",
+            8: "de-DE",
+            9: "ru-RU",
+        }
+
+        # defaults to -1 if language field not in tsv, locale will be an empty string
+        language_id = (
+            int(float(record.get("language", -1))) if record.get("language") else -1
+        )
+
         shapes_data = [
             {
                 "type": "rectangle",
@@ -112,9 +164,209 @@ def load_anno(file_object, annotations):
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
     is_zip = zipfile.is_zipfile(src_file)
     src_file.seek(0)
+    file_name = os.path.basename(src_file.name)
+    name_without_extension = os.path.splitext(file_name)[0]  # used below as the new task's name
+
     if is_zip:
         zipfile.ZipFile(src_file).extractall(temp_dir)
 
-        anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
-        for p in anno_paths:
-            load_anno(p, instance_data)
+        if isinstance(instance_data, ProjectData):  # project import: build a new task from the archive
+            project = instance_data.db_project
+            new_task = Task.objects.create(
+                project=project,
+                name=name_without_extension,
+                segment_size=0,
+            )
+            new_task.save()
+
+            with transaction.atomic():  # re-read the task under a row lock and give it a Data record if it has none
+                locked_instance = Task.objects.select_for_update().get(pk=new_task.id)
+                task_data = locked_instance.data
+                if not task_data:
+                    task_data = Data.objects.create()
+                    task_data.make_dirs()
+                    locked_instance.data = task_data
+                    locked_instance.save()
+
+            clips_folder = os.path.join(temp_dir, "clips")
+            tsv_file_path = os.path.join(temp_dir, "data.tsv")  # "clips/" and "data.tsv" are looked up directly under temp_dir
+
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                tsv_rows = list(reader)
+
+                num_tsv_rows = len(tsv_rows)
+                num_clips = len(os.listdir(clips_folder))  # counts every directory entry, whatever its type
+
+                if num_tsv_rows != num_clips:
+                    raise ValueError(
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                    )
+
+            # Accumulator for the audio of all clips, concatenated in TSV row order
+            combined_audio = AudioSegment.empty()
+
+            # Second pass over the TSV: rows give the order in which clips are appended
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+
+                for row in reader:
+                    audio_file_name = row[
+                        "audio_path"
+                    ]  # 'audio_path' column is joined onto clips_folder to locate the clip
+                    file_path = os.path.join(clips_folder, audio_file_name)
+
+                    if os.path.isfile(file_path):  # NOTE(review): a row whose clip is missing is skipped silently
+                        audio_segment = AudioSegment.from_file(file_path)
+                        combined_audio += (
+                            audio_segment  # Append the audio in the order from TSV
+                        )
+
+            # Write the concatenated audio as "raw/combined_audio.wav" under the task's data directory
+            raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
+            os.makedirs(raw_folder_path, exist_ok=True)
+
+            combined_audio_path = os.path.join(raw_folder_path, "combined_audio.wav")
+            combined_audio.export(combined_audio_path, format="wav")
+
+            data = {
+                "chunk_size": None,
+                "image_quality": 70,
+                "start_frame": 0,
+                "stop_frame": None,
+                "frame_filter": "",
+                "client_files": ["combined_audio.wav"],
+                "server_files": [],
+                "remote_files": [],
+                "use_zip_chunks": False,
+                "server_files_exclude": [],
+                "use_cache": False,
+                "copy_data": False,
+                "storage_method": "file_system",
+                "storage": "local",
+                "sorting_method": "lexicographical",
+                "filename_pattern": None,
+            }
+
+            _create_thread(
+                locked_instance, data, is_task_import=True, temp_dir=temp_dir
+            )
+
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                all_rows = list(reader)
+
+                new_rows = split_rows_by_time(all_rows)  # uses the helper's default 600 threshold
+
+            jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")  # jobs of the new task, lowest id first
+
+            label_data = InstanceLabelData(instance_data.db_project)
+
+            record_index = 0  # shared across jobs: each job resumes where the previous one stopped
+            for job in jobs:
+                start_time = 0  # shape times restart at 0 for every job
+
+                while record_index < len(new_rows):
+                    record = new_rows[record_index]
+
+                    record_duration = calculate_duration(record)
+
+                    end_time = start_time + record_duration
+
+                    label_name = record.get("label")
+                    label_id = label_data._get_label_id(label_name)
+
+                    attributes = []
+
+                    # Process dynamic attribute_i_name and attribute_i_value fields
+                    attribute_index = 1  # Start with the first attribute
+                    while True:
+                        attribute_name_key = f"attribute_{attribute_index}_name"
+                        attribute_value_key = f"attribute_{attribute_index}_value"
+
+                        # Check if the keys exist in the record
+                        if (
+                            attribute_name_key in record
+                            and attribute_value_key in record
+                        ):
+                            attribute_name = record.get(attribute_name_key)
+                            attribute_value = record.get(attribute_value_key)
+
+                            if attribute_name and attribute_value:
+                                spec_id = label_data._get_attribute_id(
+                                    label_id, attribute_name
+                                )
+                                attributes.append(
+                                    {
+                                        "spec_id": spec_id,
+                                        "value": attribute_value,
+                                    }
+                                )
+
+                            attribute_index += 1  # Move to the next attribute index
+                        else:
+                            break  # Exit the loop when no more attributes are found
+
+                    language_id_to_locale_mapping = {
+                        0: "en-US",
+                        1: "es-ES",
+                        2: "fr-FR",
+                        3: "zh-CN",
+                        4: "hi-IN",
+                        5: "ar-EG",
+                        6: "pt-BR",
+                        7: "ja-JP",
+                        8: "de-DE",
+                        9: "ru-RU",
+                    }
+
+                    # -1 when the language field is missing or empty; the locale lookup then yields ""
+                    language_id = (
+                        int(float(record.get("language", -1)))
+                        if record.get("language")
+                        else -1
+                    )
+
+                    shapes_data = [
+                        {
+                            "type": "rectangle",
+                            "label": record.get("label", ""),
+                            "points": [start_time, start_time, end_time, end_time],
+                            "frame": 0,
+                            "occluded": False,
+                            "z_order": 0,
+                            "group": None,
+                            "source": "manual",
+                            "transcript": record.get("normalized_text", ""),
+                            "gender": record.get("gender", ""),
+                            "age": record.get("age", ""),
+                            "locale": language_id_to_locale_mapping.get(
+                                language_id, ""
+                            ),
+                            "accent": record.get("accent", ""),
+                            "emotion": record.get("emotion", ""),
+                            "rotation": 0.0,
+                            "label_id": label_id,
+                            "attributes": attributes,
+                        }
+                    ]
+
+                    data = {"shapes": shapes_data}
+                    start_time = end_time
+
+                    serializer = LabeledDataSerializer(data=data)
+                    pk = int(job.id)
+                    action = PatchAction.CREATE
+
+                    if serializer.is_valid(raise_exception=True):
+                        data = dm.task.patch_job_data(pk, serializer.data, action)
+
+                    record_index += 1
+                    total_duration = round(end_time, 2)
+                    if 599.9 <= total_duration <= 600:  # about 600 accumulated: continue with the next job
+                        break
+
+        else:
+            anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
+            for p in anno_paths:
+                load_anno(p, instance_data)
diff --git a/cvat/apps/dataset_manager/formats/common_voice.py b/cvat/apps/dataset_manager/formats/common_voice.py
index 1e6be783ded1..9593d3b007ab 100644
--- a/cvat/apps/dataset_manager/formats/common_voice.py
+++ b/cvat/apps/dataset_manager/formats/common_voice.py
@@ -1,12 +1,62 @@
+import os
 import os.path as osp
 import zipfile
+import csv
+from django.db import transaction
 from glob import glob
+from pydub import AudioSegment
 from cvat.apps.dataset_manager.bindings import InstanceLabelData
 from cvat.apps.engine.serializers import LabeledDataSerializer
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.task import PatchAction
 from .registry import importer
-from cvat.apps.engine.models import Task, Job
+from cvat.apps.engine.models import Job, Task, Data
+from cvat.apps.engine.task import _create_thread
+from cvat.apps.dataset_manager.bindings import ProjectData
+
+
+def calculate_duration(row):
+    """Return float(row["end"]) - float(row["start"]); times are assumed to be seconds."""
+    begin, finish = float(row["start"]), float(row["end"])
+    return finish - begin
+
+
+def split_rows_by_time(all_rows, time_threshold=600):
+    """Cut rows wherever the running duration reaches ``time_threshold``.
+
+    Each row is a dict whose "start"/"end" values parse as floats. A row that
+    crosses a boundary is replaced by copies with adjusted "start"/"end"
+    strings; the remainder is cut again while it is still too long, so no
+    piece straddles a boundary. Raises ValueError if a cut makes no progress.
+    """
+    if time_threshold <= 0:
+        raise ValueError("time_threshold must be positive")
+
+    result = []
+    total_duration = 0
+
+    for row in all_rows:
+        piece = row
+        duration = float(piece["end"]) - float(piece["start"])
+        while total_duration + duration > time_threshold:
+            remaining_time = time_threshold - total_duration
+            first_part = piece.copy()
+            first_part["end"] = str(float(first_part["start"]) + remaining_time)
+            second_part = piece.copy()
+            second_part["start"] = first_part["end"]
+            rest = float(second_part["end"]) - float(second_part["start"])
+            if total_duration == 0 and not rest < duration:
+                # a full-window cut that shrinks nothing (inf, precision) would loop
+                raise ValueError(f"cannot split row into {time_threshold}s pieces: {row}")
+            result.append(first_part)
+            piece, duration, total_duration = second_part, rest, 0
+
+        result.append(piece)
+        total_duration += duration
+        if total_duration >= time_threshold:
+            total_duration = 0  # window exactly full: the next row opens a new one
+
+    return result
 
 
 def load_anno(file_object, annotations):
@@ -96,9 +146,188 @@ def load_anno(file_object, annotations):
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
     is_zip = zipfile.is_zipfile(src_file)
     src_file.seek(0)
+    file_name = os.path.basename(src_file.name)
+    name_without_extension = os.path.splitext(file_name)[0]  # used below as the new task's name
     if is_zip:
         zipfile.ZipFile(src_file).extractall(temp_dir)
 
-        anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
-        for p in anno_paths:
-            load_anno(p, instance_data)
+        if isinstance(instance_data, ProjectData):  # project import: build a new task from the archive
+            project = instance_data.db_project
+            new_task = Task.objects.create(
+                project=project,
+                name=name_without_extension,
+                segment_size=0,
+            )
+            new_task.save()
+
+            with transaction.atomic():  # re-read the task under a row lock and give it a Data record if it has none
+                locked_instance = Task.objects.select_for_update().get(pk=new_task.id)
+                task_data = locked_instance.data
+                if not task_data:
+                    task_data = Data.objects.create()
+                    task_data.make_dirs()
+                    locked_instance.data = task_data
+                    locked_instance.save()
+
+            clips_folder = os.path.join(temp_dir, "clips")
+            tsv_file_path = os.path.join(temp_dir, "data.tsv")  # "clips/" and "data.tsv" are looked up directly under temp_dir
+
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                tsv_rows = list(reader)
+
+                num_tsv_rows = len(tsv_rows)
+                num_clips = len(os.listdir(clips_folder))  # counts every directory entry, whatever its type
+
+                if num_tsv_rows != num_clips:
+                    raise ValueError(
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                    )
+
+            # Accumulator for the audio of all clips, concatenated in TSV row order
+            combined_audio = AudioSegment.empty()
+
+            # Second pass over the TSV: rows give the order in which clips are appended
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+
+                for row in reader:
+                    audio_file_name = row[
+                        "path"
+                    ]  # 'path' column is joined onto clips_folder to locate the clip
+                    file_path = os.path.join(clips_folder, audio_file_name)
+
+                    if os.path.isfile(file_path):  # NOTE(review): a row whose clip is missing is skipped silently
+                        audio_segment = AudioSegment.from_file(file_path)
+                        combined_audio += (
+                            audio_segment  # Append the audio in the order from TSV
+                        )
+
+            # Write the concatenated audio as "raw/combined_audio.wav" under the task's data directory
+            raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
+            os.makedirs(raw_folder_path, exist_ok=True)
+
+            combined_audio_path = os.path.join(raw_folder_path, "combined_audio.wav")
+            combined_audio.export(combined_audio_path, format="wav")
+
+            data = {
+                "chunk_size": None,
+                "image_quality": 70,
+                "start_frame": 0,
+                "stop_frame": None,
+                "frame_filter": "",
+                "client_files": ["combined_audio.wav"],
+                "server_files": [],
+                "remote_files": [],
+                "use_zip_chunks": False,
+                "server_files_exclude": [],
+                "use_cache": False,
+                "copy_data": False,
+                "storage_method": "file_system",
+                "storage": "local",
+                "sorting_method": "lexicographical",
+                "filename_pattern": None,
+            }
+
+            _create_thread(
+                locked_instance, data, is_task_import=True, temp_dir=temp_dir
+            )
+
+
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                all_rows = list(reader)
+
+            new_rows = split_rows_by_time(all_rows)  # uses the helper's default 600 threshold
+
+            jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")  # jobs of the new task, lowest id first
+
+            label_data = InstanceLabelData(instance_data.db_project)
+
+            record_index = 0  # shared across jobs: each job resumes where the previous one stopped
+            for job in jobs:
+                start_time = 0  # shape times restart at 0 for every job
+
+                while record_index < len(new_rows):
+                    record = new_rows[record_index]
+
+                    record_duration = calculate_duration(record)
+
+                    end_time = start_time + record_duration
+
+                    label_name = record.get("label")
+                    label_id = label_data._get_label_id(label_name)
+
+                    attributes = []
+
+                    # Process dynamic attribute_i_name and attribute_i_value fields
+                    attribute_index = 1  # Start with the first attribute
+                    while True:
+                        attribute_name_key = f"attribute_{attribute_index}_name"
+                        attribute_value_key = f"attribute_{attribute_index}_value"
+
+                        # Check if the keys exist in the record
+                        if (
+                            attribute_name_key in record
+                            and attribute_value_key in record
+                        ):
+                            attribute_name = record.get(attribute_name_key)
+                            attribute_value = record.get(attribute_value_key)
+
+                            if attribute_name and attribute_value:
+                                spec_id = label_data._get_attribute_id(
+                                    label_id, attribute_name
+                                )
+                                attributes.append(
+                                    {
+                                        "spec_id": spec_id,
+                                        "value": attribute_value,
+                                    }
+                                )
+
+                            attribute_index += 1  # Move to the next attribute index
+                        else:
+                            break  # Exit the loop when no more attributes are found
+
+                    shapes_data = [
+                        {
+                            "type": "rectangle",
+                            "label": record.get("label", ""),
+                            "points": [start_time, start_time, end_time, end_time],
+                            "frame": 0,
+                            "occluded": False,
+                            "z_order": 0,
+                            "group": None,
+                            "source": "manual",
+                            "transcript": record.get("sentence", ""),
+                            "gender": record.get("gender", ""),
+                            "age": record.get("age", ""),
+                            "locale": record.get("locale", ""),
+                            "accent": record.get("accents", ""),
+                            "emotion": record.get("emotion", ""),
+                            "rotation": 0.0,
+                            "label_id": label_id,
+                            "attributes": attributes,
+                        }
+                    ]
+
+                    data = {"shapes": shapes_data}
+                    start_time = end_time
+
+                    serializer = LabeledDataSerializer(data=data)
+                    pk = int(job.id)
+                    action = PatchAction.CREATE
+
+                    if serializer.is_valid(raise_exception=True):
+                        data = dm.task.patch_job_data(pk, serializer.data, action)
+
+                    record_index += 1
+                    total_duration = round(end_time, 2)
+                    if 599.9 <= total_duration <= 600:  # about 600 accumulated: continue with the next job
+                        break
+
+        else:
+
+            anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
+            for p in anno_paths:
+                load_anno(p, instance_data)
diff --git a/cvat/apps/dataset_manager/formats/librispeech.py b/cvat/apps/dataset_manager/formats/librispeech.py
index 4ec6e7543699..5d5018b9edf6 100644
--- a/cvat/apps/dataset_manager/formats/librispeech.py
+++ b/cvat/apps/dataset_manager/formats/librispeech.py
@@ -1,12 +1,62 @@
+import os
 import os.path as osp
 import zipfile
+import csv
+from django.db import transaction
 from glob import glob
+from pydub import AudioSegment
 from cvat.apps.dataset_manager.bindings import InstanceLabelData
 from cvat.apps.engine.serializers import LabeledDataSerializer
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.task import PatchAction
 from .registry import importer
-from cvat.apps.engine.models import Task, Job
+from cvat.apps.engine.models import Job, Task, Data
+from cvat.apps.engine.task import _create_thread
+from cvat.apps.dataset_manager.bindings import ProjectData
+
+
+def calculate_duration(row):
+    """Return ``end - start`` for *row*; both values are assumed to be seconds."""
+    begin, finish = (float(row[key]) for key in ("start", "end"))
+    return finish - begin
+
+
+def split_rows_by_time(all_rows, time_threshold=600):
+    """Cut rows so that none crosses a ``time_threshold``-second window edge.
+
+    Durations ("end" minus "start") are accumulated in row order. A row that
+    overflows the current window is cut at the window edge, and its remainder
+    is cut again for as long as it is longer than a whole window. A row that
+    ends exactly on the edge closes the window without producing a cut.
+
+    Args:
+        all_rows: Mappings whose "start"/"end" values are accepted by float().
+        time_threshold: Window length, in the unit of "start"/"end"; positive.
+
+    Returns:
+        A new list. Uncut rows are the input objects themselves; cut pieces
+        are dict copies whose "start"/"end" were rewritten as strings.
+
+    Raises:
+        ValueError: ``time_threshold`` <= 0, or a row has infinite duration.
+    """
+    if time_threshold <= 0:
+        raise ValueError("time_threshold must be positive")
+    result = []
+    elapsed = 0  # time already placed in the current window
+    for piece in all_rows:
+        length = float(piece["end"]) - float(piece["start"])
+        if length == float("inf"):
+            raise ValueError("row duration must be finite")
+        while elapsed + length > time_threshold:
+            edge = str(float(piece["start"]) + (time_threshold - elapsed))
+            result.append({**piece, "end": edge})
+            piece = {**piece, "start": edge}
+            length = float(piece["end"]) - float(edge)
+            elapsed = 0
+        result.append(piece)
+        elapsed = 0 if elapsed + length >= time_threshold else elapsed + length
+    return result
 
 
 def load_anno(file_object, annotations):
@@ -74,7 +124,9 @@ def load_anno(file_object, annotations):
         }
 
         # defaults to -1 if language field not in tsv, locale will be an empty string
-        language_id = int(record.get("language", -1))
+        language_id = (
+            int(float(record.get("language", -1))) if record.get("language") else -1
+        )
 
         shapes_data = [
             {
@@ -112,9 +164,244 @@ def load_anno(file_object, annotations):
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
+    """Import annotations from an uploaded archive into ``instance_data``.
+
+    For a zip upload targeting a project (``ProjectData``), a new task named
+    after the archive is created, the clips listed in ``data.tsv`` are
+    concatenated into one WAV file, and one rectangle shape per (possibly
+    split) TSV row is patched into the task's jobs. For any other zip upload,
+    every ``*.tsv`` under ``temp_dir`` is passed to ``load_anno``. Nothing is
+    done when ``src_file`` is not a zip archive.
+
+    Args:
+        src_file: Uploaded file object; its ``name`` gives the new task name.
+        temp_dir: Directory the archive is extracted into.
+        instance_data: Import target; ``ProjectData`` selects the project path.
+        load_data_callback: Not used by this function.
+        **kwargs: Not used by this function.
+
+    Raises:
+        ValueError: The number of TSV rows differs from the number of entries
+            in the ``clips`` folder.
+    """
     is_zip = zipfile.is_zipfile(src_file)
     src_file.seek(0)
+    file_name = os.path.basename(src_file.name)
+    name_without_extension = os.path.splitext(file_name)[0]
+
     if is_zip:
         zipfile.ZipFile(src_file).extractall(temp_dir)
 
-        anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
-        for p in anno_paths:
-            load_anno(p, instance_data)
+        if isinstance(instance_data, ProjectData):
+            # Project import: create a new task named after the uploaded archive.
+            project = instance_data.db_project
+            new_task = Task.objects.create(
+                project=project,
+                name=name_without_extension,
+                segment_size=0,
+            )
+            new_task.save()
+
+            # Give the new task a Data record (and its directories) under a row lock.
+            # NOTE(review): locked_instance is reused after this block; the lock presumably ends with it.
+            with transaction.atomic():
+                locked_instance = Task.objects.select_for_update().get(pk=new_task.id)
+                task_data = locked_instance.data
+                if not task_data:
+                    task_data = Data.objects.create()
+                    task_data.make_dirs()
+                    locked_instance.data = task_data
+                    locked_instance.save()
+
+            clips_folder = os.path.join(temp_dir, "clips")
+            tsv_file_path = os.path.join(temp_dir, "data.tsv")
+
+            # Sanity check: the TSV must have one row per entry in the clips folder.
+            # NOTE(review): os.listdir counts every entry, not only audio files.
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                tsv_rows = list(reader)
+
+                num_tsv_rows = len(tsv_rows)
+                num_clips = len(os.listdir(clips_folder))
+
+                if num_tsv_rows != num_clips:
+                    raise ValueError(
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                    )
+
+            # Combined audio that will be the final output
+            combined_audio = AudioSegment.empty()
+
+            # Read TSV file to get the ordered list of audio files
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+
+                for row in reader:
+                    audio_file_name = row[
+                        "file"
+                    ]  # Assuming 'file' column contains audio file names
+                    file_path = os.path.join(clips_folder, audio_file_name)
+
+                    # NOTE(review): a row whose clip file is missing is skipped silently.
+                    if os.path.isfile(file_path):
+                        audio_segment = AudioSegment.from_file(file_path)
+                        combined_audio += (
+                            audio_segment  # Append the audio in the order from TSV
+                        )
+
+            # Create raw folder to store combined audio
+            raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
+            os.makedirs(raw_folder_path, exist_ok=True)
+
+            combined_audio_path = os.path.join(raw_folder_path, "combined_audio.wav")
+            combined_audio.export(combined_audio_path, format="wav")
+
+            # Settings passed to _create_thread; the combined WAV is the only client file.
+            data = {
+                "chunk_size": None,
+                "image_quality": 70,
+                "start_frame": 0,
+                "stop_frame": None,
+                "frame_filter": "",
+                "client_files": ["combined_audio.wav"],
+                "server_files": [],
+                "remote_files": [],
+                "use_zip_chunks": False,
+                "server_files_exclude": [],
+                "use_cache": False,
+                "copy_data": False,
+                "storage_method": "file_system",
+                "storage": "local",
+                "sorting_method": "lexicographical",
+                "filename_pattern": None,
+            }
+
+            _create_thread(
+                locked_instance, data, is_task_import=True, temp_dir=temp_dir
+            )
+
+            # Re-read the TSV and split rows that cross a time window (see split_rows_by_time).
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                all_rows = list(reader)
+
+                new_rows = split_rows_by_time(all_rows)
+
+            jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")
+
+            label_data = InstanceLabelData(instance_data.db_project)
+
+            # Hand rows out to jobs in id order; record_index carries over between jobs.
+            record_index = 0
+            for job in jobs:
+                # Time offsets restart at 0 for every job.
+                start_time = 0
+
+                while record_index < len(new_rows):
+                    record = new_rows[record_index]
+
+                    record_duration = calculate_duration(record)
+
+                    end_time = start_time + record_duration
+
+                    label_name = record.get("label")
+                    label_id = label_data._get_label_id(label_name)
+
+                    attributes = []
+
+                    # Process dynamic attribute_i_name and attribute_i_value fields
+                    attribute_index = 1  # Start with the first attribute
+                    while True:
+                        attribute_name_key = f"attribute_{attribute_index}_name"
+                        attribute_value_key = f"attribute_{attribute_index}_value"
+
+                        # Check if the keys exist in the record
+                        if (
+                            attribute_name_key in record
+                            and attribute_value_key in record
+                        ):
+                            attribute_name = record.get(attribute_name_key)
+                            attribute_value = record.get(attribute_value_key)
+
+                            if attribute_name and attribute_value:
+                                spec_id = label_data._get_attribute_id(
+                                    label_id, attribute_name
+                                )
+                                attributes.append(
+                                    {
+                                        "spec_id": spec_id,
+                                        "value": attribute_value,
+                                    }
+                                )
+
+                            attribute_index += 1  # Move to the next attribute index
+                        else:
+                            break  # Exit the loop when no more attributes are found
+
+                    language_id_to_locale_mapping = {
+                        0: "en-US",
+                        1: "es-ES",
+                        2: "fr-FR",
+                        3: "zh-CN",
+                        4: "hi-IN",
+                        5: "ar-EG",
+                        6: "pt-BR",
+                        7: "ja-JP",
+                        8: "de-DE",
+                        9: "ru-RU",
+                    }
+
+                    # defaults to -1 if language field not in tsv, locale will be an empty string
+                    language_id = (
+                        int(float(record.get("language", -1)))
+                        if record.get("language")
+                        else -1
+                    )
+
+                    # "points" carries the time span as [start, start, end, end].
+                    shapes_data = [
+                        {
+                            "type": "rectangle",
+                            "label": record.get("label", ""),
+                            "points": [start_time, start_time, end_time, end_time],
+                            "frame": 0,
+                            "occluded": False,
+                            "z_order": 0,
+                            "group": None,
+                            "source": "manual",
+                            "transcript": record.get("text", ""),
+                            "gender": record.get("gender", ""),
+                            "age": record.get("age", ""),
+                            "locale": language_id_to_locale_mapping.get(
+                                language_id, ""
+                            ),
+                            "accent": record.get("accent", ""),
+                            "emotion": record.get("emotion", ""),
+                            "rotation": 0.0,
+                            "label_id": label_id,
+                            "attributes": attributes,
+                        }
+                    ]
+
+                    data = {"shapes": shapes_data}
+                    start_time = end_time
+
+                    serializer = LabeledDataSerializer(data=data)
+                    pk = int(job.id)
+                    action = PatchAction.CREATE
+
+                    # is_valid raises on invalid data, so the patch only runs for valid shapes.
+                    if serializer.is_valid(raise_exception=True):
+                        data = dm.task.patch_job_data(pk, serializer.data, action)
+
+                    record_index += 1
+                    # Switch to the next job once this one holds ~600 s (rounded to 2 decimals).
+                    # NOTE(review): if the total skips the 599.9-600 window, later rows stay in this job.
+                    total_duration = round(end_time, 2)
+                    if 599.9 <= total_duration <= 600:
+                        break
+
+        else:
+            # Not a project import: load every TSV found in the extracted archive.
+            anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
+            for p in anno_paths:
+                load_anno(p, instance_data)
diff --git a/cvat/apps/dataset_manager/formats/tedlium.py b/cvat/apps/dataset_manager/formats/tedlium.py
index 3cb49283d674..6463713e1c1f 100644
--- a/cvat/apps/dataset_manager/formats/tedlium.py
+++ b/cvat/apps/dataset_manager/formats/tedlium.py
@@ -1,12 +1,62 @@
+import os
 import os.path as osp
 import zipfile
+import csv
+from django.db import transaction
 from glob import glob
+from pydub import AudioSegment
 from cvat.apps.dataset_manager.bindings import InstanceLabelData
 from cvat.apps.engine.serializers import LabeledDataSerializer
 import cvat.apps.dataset_manager as dm
 from cvat.apps.dataset_manager.task import PatchAction
 from .registry import importer
-from cvat.apps.engine.models import Task, Job
+from cvat.apps.engine.models import Job, Task, Data
+from cvat.apps.engine.task import _create_thread
+from cvat.apps.dataset_manager.bindings import ProjectData
+
+
+def calculate_duration(row):
+    """Return ``end - start`` for *row*; both values are assumed to be seconds."""
+    begin, finish = (float(row[key]) for key in ("start", "end"))
+    return finish - begin
+
+
+def split_rows_by_time(all_rows, time_threshold=600):
+    """Cut rows so that none crosses a ``time_threshold``-second window edge.
+
+    Durations ("end" minus "start") are accumulated in row order. A row that
+    overflows the current window is cut at the window edge, and its remainder
+    is cut again for as long as it is longer than a whole window. A row that
+    ends exactly on the edge closes the window without producing a cut.
+
+    Args:
+        all_rows: Mappings whose "start"/"end" values are accepted by float().
+        time_threshold: Window length, in the unit of "start"/"end"; positive.
+
+    Returns:
+        A new list. Uncut rows are the input objects themselves; cut pieces
+        are dict copies whose "start"/"end" were rewritten as strings.
+
+    Raises:
+        ValueError: ``time_threshold`` <= 0, or a row has infinite duration.
+    """
+    if time_threshold <= 0:
+        raise ValueError("time_threshold must be positive")
+    result = []
+    elapsed = 0  # time already placed in the current window
+    for piece in all_rows:
+        length = float(piece["end"]) - float(piece["start"])
+        if length == float("inf"):
+            raise ValueError("row duration must be finite")
+        while elapsed + length > time_threshold:
+            edge = str(float(piece["start"]) + (time_threshold - elapsed))
+            result.append({**piece, "end": edge})
+            piece = {**piece, "start": edge}
+            length = float(piece["end"]) - float(edge)
+            elapsed = 0
+        result.append(piece)
+        elapsed = 0 if elapsed + length >= time_threshold else elapsed + length
+    return result
 
 
 def load_anno(file_object, annotations):
@@ -39,22 +89,6 @@ def load_anno(file_object, annotations):
         label_name = record.get("label")
         label_id = label_data._get_label_id(label_name)
 
-        language_id_to_locale_mapping = {
-            0: "en-US",
-            1: "es-ES",
-            2: "fr-FR",
-            3: "zh-CN",
-            4: "hi-IN",
-            5: "ar-EG",
-            6: "pt-BR",
-            7: "ja-JP",
-            8: "de-DE",
-            9: "ru-RU",
-        }
-
-        # defaults to -1 if language field not in tsv, locale will be an empty string
-        language_id = int(record.get("language", -1))
-
         attributes = []
 
         for i in range(1, len(headers)):
@@ -76,6 +110,24 @@ def load_anno(file_object, annotations):
                         }
                     )
 
+        language_id_to_locale_mapping = {
+            0: "en-US",
+            1: "es-ES",
+            2: "fr-FR",
+            3: "zh-CN",
+            4: "hi-IN",
+            5: "ar-EG",
+            6: "pt-BR",
+            7: "ja-JP",
+            8: "de-DE",
+            9: "ru-RU",
+        }
+
+        # defaults to -1 if language field not in tsv, locale will be an empty string
+        language_id = (
+            int(float(record.get("language", -1))) if record.get("language") else -1
+        )
+
         shapes_data = [
             {
                 "type": "rectangle",
@@ -112,9 +164,244 @@ def load_anno(file_object, annotations):
 def _import(src_file, temp_dir, instance_data, load_data_callback=None, **kwargs):
+    """Import annotations from an uploaded archive into ``instance_data``.
+
+    For a zip upload targeting a project (``ProjectData``), a new task named
+    after the archive is created, the clips listed in ``data.tsv`` are
+    concatenated into one WAV file, and one rectangle shape per (possibly
+    split) TSV row is patched into the task's jobs. For any other zip upload,
+    every ``*.tsv`` under ``temp_dir`` is passed to ``load_anno``. Nothing is
+    done when ``src_file`` is not a zip archive.
+
+    Args:
+        src_file: Uploaded file object; its ``name`` gives the new task name.
+        temp_dir: Directory the archive is extracted into.
+        instance_data: Import target; ``ProjectData`` selects the project path.
+        load_data_callback: Not used by this function.
+        **kwargs: Not used by this function.
+
+    Raises:
+        ValueError: The number of TSV rows differs from the number of entries
+            in the ``clips`` folder.
+    """
     is_zip = zipfile.is_zipfile(src_file)
     src_file.seek(0)
+    file_name = os.path.basename(src_file.name)
+    name_without_extension = os.path.splitext(file_name)[0]
+
     if is_zip:
         zipfile.ZipFile(src_file).extractall(temp_dir)
 
-        anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
-        for p in anno_paths:
-            load_anno(p, instance_data)
+        if isinstance(instance_data, ProjectData):
+            # Project import: create a new task named after the uploaded archive.
+            project = instance_data.db_project
+            new_task = Task.objects.create(
+                project=project,
+                name=name_without_extension,
+                segment_size=0,
+            )
+            new_task.save()
+
+            # Give the new task a Data record (and its directories) under a row lock.
+            # NOTE(review): locked_instance is reused after this block; the lock presumably ends with it.
+            with transaction.atomic():
+                locked_instance = Task.objects.select_for_update().get(pk=new_task.id)
+                task_data = locked_instance.data
+                if not task_data:
+                    task_data = Data.objects.create()
+                    task_data.make_dirs()
+                    locked_instance.data = task_data
+                    locked_instance.save()
+
+            clips_folder = os.path.join(temp_dir, "clips")
+            tsv_file_path = os.path.join(temp_dir, "data.tsv")
+
+            # Sanity check: the TSV must have one row per entry in the clips folder.
+            # NOTE(review): os.listdir counts every entry, not only audio files.
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                tsv_rows = list(reader)
+
+                num_tsv_rows = len(tsv_rows)
+                num_clips = len(os.listdir(clips_folder))
+
+                if num_tsv_rows != num_clips:
+                    raise ValueError(
+                        f"Import failed: {num_tsv_rows} rows in TSV but {num_clips} audio clips in the clips folder. The numbers must match."
+                    )
+
+            # Combined audio that will be the final output
+            combined_audio = AudioSegment.empty()
+
+            # Read TSV file to get the ordered list of audio files
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+
+                for row in reader:
+                    audio_file_name = row[
+                        "file"
+                    ]  # Assuming 'file' column contains audio file names
+                    file_path = os.path.join(clips_folder, audio_file_name)
+
+                    # NOTE(review): a row whose clip file is missing is skipped silently.
+                    if os.path.isfile(file_path):
+                        audio_segment = AudioSegment.from_file(file_path)
+                        combined_audio += (
+                            audio_segment  # Append the audio in the order from TSV
+                        )
+
+            # Create raw folder to store combined audio
+            raw_folder_path = os.path.join(task_data.get_data_dirname(), "raw")
+            os.makedirs(raw_folder_path, exist_ok=True)
+
+            combined_audio_path = os.path.join(raw_folder_path, "combined_audio.wav")
+            combined_audio.export(combined_audio_path, format="wav")
+
+            # Settings passed to _create_thread; the combined WAV is the only client file.
+            data = {
+                "chunk_size": None,
+                "image_quality": 70,
+                "start_frame": 0,
+                "stop_frame": None,
+                "frame_filter": "",
+                "client_files": ["combined_audio.wav"],
+                "server_files": [],
+                "remote_files": [],
+                "use_zip_chunks": False,
+                "server_files_exclude": [],
+                "use_cache": False,
+                "copy_data": False,
+                "storage_method": "file_system",
+                "storage": "local",
+                "sorting_method": "lexicographical",
+                "filename_pattern": None,
+            }
+
+            _create_thread(
+                locked_instance, data, is_task_import=True, temp_dir=temp_dir
+            )
+
+            # Re-read the TSV and split rows that cross a time window (see split_rows_by_time).
+            with open(tsv_file_path, "r", newline="", encoding="utf-8") as tsvfile:
+                reader = csv.DictReader(tsvfile, delimiter="\t")
+                all_rows = list(reader)
+
+                new_rows = split_rows_by_time(all_rows)
+
+            jobs = Job.objects.filter(segment__task=locked_instance).order_by("id")
+
+            label_data = InstanceLabelData(instance_data.db_project)
+
+            # Hand rows out to jobs in id order; record_index carries over between jobs.
+            record_index = 0
+            for job in jobs:
+                # Time offsets restart at 0 for every job.
+                start_time = 0
+
+                while record_index < len(new_rows):
+                    record = new_rows[record_index]
+
+                    record_duration = calculate_duration(record)
+
+                    end_time = start_time + record_duration
+
+                    label_name = record.get("label")
+                    label_id = label_data._get_label_id(label_name)
+
+                    attributes = []
+
+                    # Process dynamic attribute_i_name and attribute_i_value fields
+                    attribute_index = 1  # Start with the first attribute
+                    while True:
+                        attribute_name_key = f"attribute_{attribute_index}_name"
+                        attribute_value_key = f"attribute_{attribute_index}_value"
+
+                        # Check if the keys exist in the record
+                        if (
+                            attribute_name_key in record
+                            and attribute_value_key in record
+                        ):
+                            attribute_name = record.get(attribute_name_key)
+                            attribute_value = record.get(attribute_value_key)
+
+                            if attribute_name and attribute_value:
+                                spec_id = label_data._get_attribute_id(
+                                    label_id, attribute_name
+                                )
+                                attributes.append(
+                                    {
+                                        "spec_id": spec_id,
+                                        "value": attribute_value,
+                                    }
+                                )
+
+                            attribute_index += 1  # Move to the next attribute index
+                        else:
+                            break  # Exit the loop when no more attributes are found
+
+                    language_id_to_locale_mapping = {
+                        0: "en-US",
+                        1: "es-ES",
+                        2: "fr-FR",
+                        3: "zh-CN",
+                        4: "hi-IN",
+                        5: "ar-EG",
+                        6: "pt-BR",
+                        7: "ja-JP",
+                        8: "de-DE",
+                        9: "ru-RU",
+                    }
+
+                    # defaults to -1 if language field not in tsv, locale will be an empty string
+                    language_id = (
+                        int(float(record.get("language", -1)))
+                        if record.get("language")
+                        else -1
+                    )
+
+                    # "points" carries the time span as [start, start, end, end].
+                    shapes_data = [
+                        {
+                            "type": "rectangle",
+                            "label": record.get("label", ""),
+                            "points": [start_time, start_time, end_time, end_time],
+                            "frame": 0,
+                            "occluded": False,
+                            "z_order": 0,
+                            "group": None,
+                            "source": "manual",
+                            "transcript": record.get("text", ""),
+                            "gender": record.get("gender", ""),
+                            "age": record.get("age", ""),
+                            "locale": language_id_to_locale_mapping.get(
+                                language_id, ""
+                            ),
+                            "accent": record.get("accent", ""),
+                            "emotion": record.get("emotion", ""),
+                            "rotation": 0.0,
+                            "label_id": label_id,
+                            "attributes": attributes,
+                        }
+                    ]
+
+                    data = {"shapes": shapes_data}
+                    start_time = end_time
+
+                    serializer = LabeledDataSerializer(data=data)
+                    pk = int(job.id)
+                    action = PatchAction.CREATE
+
+                    # is_valid raises on invalid data, so the patch only runs for valid shapes.
+                    if serializer.is_valid(raise_exception=True):
+                        data = dm.task.patch_job_data(pk, serializer.data, action)
+
+                    record_index += 1
+                    # Switch to the next job once this one holds ~600 s (rounded to 2 decimals).
+                    # NOTE(review): if the total skips the 599.9-600 window, later rows stay in this job.
+                    total_duration = round(end_time, 2)
+                    if 599.9 <= total_duration <= 600:
+                        break
+
+        else:
+            # Not a project import: load every TSV found in the extracted archive.
+            anno_paths = glob(osp.join(temp_dir, "**", "*.tsv"), recursive=True)
+            for p in anno_paths:
+                load_anno(p, instance_data)