diff --git a/cvat/apps/dataset_manager/task.py b/cvat/apps/dataset_manager/task.py index e38673f5cee1..05e0e5dc4f0f 100644 --- a/cvat/apps/dataset_manager/task.py +++ b/cvat/apps/dataset_manager/task.py @@ -11,6 +11,7 @@ import os import json import zipfile +from pydub import AudioSegment from scipy.io import wavfile import numpy as np from collections import OrderedDict @@ -19,9 +20,10 @@ from tempfile import TemporaryDirectory from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError -from django.conf import settings +# from django.conf import settings from django.db import transaction from django.db.models.query import Prefetch +from cvat.apps.engine.models import Job, AttributeSpec from django.utils import timezone from rest_framework.exceptions import ValidationError @@ -29,7 +31,6 @@ from cvat.apps.engine.plugins import plugin_decorator from cvat.apps.events.handlers import handle_annotations_change from cvat.apps.profiler import silk_profile -from cvat.apps.engine.cache import MediaCache from cvat.apps.engine.frame_provider import FrameProvider from cvat.apps.dataset_manager.annotation import AnnotationIR, AnnotationManager from cvat.apps.dataset_manager.bindings import TaskData, JobData, CvatImportError @@ -878,9 +879,6 @@ def jobChunkPathGetter(db_data, start, stop, task_dimension, data_quality, data_ # db_data = Task Data frame_provider = FrameProvider(db_data, task_dimension) - start_chunk = frame_provider.get_chunk_number(start) - stop_chunk = frame_provider.get_chunk_number(stop) - # self.type = data_type number = int(data_num) if data_num is not None else None @@ -894,15 +892,20 @@ def jobChunkPathGetter(db_data, start, stop, task_dimension, data_quality, data_ return path -def chunk_annotation_audio(audio_file, output_folder, annotations): - # Load audio - # y, sr = librosa.load(audio_file, sr=None) - sr, y = wavfile.read(audio_file) +def chunk_annotation_audio(concat_array, output_folder, annotations): + # Convert NumPy array to AudioSegment + sr = 44100 # sampling rate + audio_segment = AudioSegment(concat_array.tobytes(), frame_rate=sr, channels=1, sample_width=4) + + try: + y = audio_segment.get_array_of_samples() + except Exception as e: + return None data = [] - # Loop over shapes + for i, shape in enumerate(annotations, 1): - # Extract transcript and time points + start_time = min(shape['points'][:2]) end_time = max(shape['points'][2:]) @@ -914,14 +917,11 @@ def chunk_annotation_audio(audio_file, output_folder, annotations): chunk = y[start_sample:end_sample] clip_uuid = str(uuid.uuid4()) - # Save the chunk with transcript as filename - output_file = os.path.join(output_folder, f"{clip_uuid}.wav") + output_file = os.path.join(output_folder, f"{clip_uuid}.mp3") soundfile.write(output_file, chunk, sr) data.append(output_file) - # logger.info(f"Annotation {str(i)} Chunk saved: {output_file}") - return data def create_annotation_clips_zip(annotation_audio_chunk_file_paths, meta_data_file_path, output_folder, dst_file): @@ -959,7 +959,6 @@ def get_np_audio_array_from_job(job_id): job_data_chunk_size = job.db_job.segment.task.data.chunk_size task_dimension = job.db_job.segment.task.dimension - storage_method = job.db_job.segment.task.data.storage_method start = job.start_frame/job_data_chunk_size stop = job.stop_frame/job_data_chunk_size @@ -967,7 +966,7 @@ def get_np_audio_array_from_job(job_id): audio_array_buffer = [] for i in range(math.trunc(start), math.trunc(stop)+1): db_job = job.db_job - data_type = "chunk" + # data_type = 
"chunk" data_num = i data_quality = 'compressed' @@ -993,30 +992,68 @@ def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir): # All Annotations annotations = job.data["shapes"] - audio_file_path = os.path.join(temp_dir, str(job_id) + ".wav") - with wave.open(audio_file_path, 'wb') as wave_file: - wave_file.setnchannels(1) - wave_file.setsampwidth(4) - wave_file.setframerate(44100) - wave_file.writeframes(concat_array) + # Job detail - annotation_audio_chunk_file_paths = chunk_annotation_audio(audio_file_path, temp_dir, annotations) + # Find labels of a particular job + job_details = Job.objects.get(id=job_id) + labels_queryset = job_details.get_labels() + labels_list = list(labels_queryset.values()) - for i in range(0, len(annotation_audio_chunk_file_paths)): - final_data.append({"path" : os.path.basename(annotation_audio_chunk_file_paths[i]), "sentence" : annotations[i]["transcript"], "age" : annotations[i]["age"], "gender" : annotations[i]["gender"], "accents" : annotations[i]["accent"], "locale" : annotations[i]["locale"], "emotion" : annotations[i]["emotion"] }) + labels_mapping = {} + + for label in labels_list: + labels_mapping[label["id"]] = label + + label_attributes_queryset = AttributeSpec.objects.filter(label=label["id"]) + + attributes_list = list(label_attributes_queryset.values()) + labels_mapping[label["id"]]["attributes"] = {} + + for attribute in attributes_list: + labels_mapping[label["id"]]["attributes"][attribute["id"]] = attribute + + slogger.glob.debug("JOB LABELS ATTRIBUTES") + slogger.glob.debug(json.dumps(attributes_list)) + + + slogger.glob.debug("JOB LABELS") + slogger.glob.debug(json.dumps(labels_list)) + + # audio_file_path = os.path.join(temp_dir, str(job_id) + ".wav") + # with wave.open(audio_file_path, 'wb') as wave_file: + # wave_file.setnchannels(1) + # wave_file.setsampwidth(4) + # wave_file.setframerate(44100) + # wave_file.writeframes(concat_array) + + annotation_audio_chunk_file_paths = chunk_annotation_audio(concat_array, temp_dir, annotations) + + for i in range(0, len(annotation_audio_chunk_file_paths)): + annotation_attribute_id = annotations[i]["attributes"][0]["spec_id"] + label_attributes = labels_mapping[annotations[i]["label_id"]]["attributes"] + annotation_attribute = label_attributes[annotation_attribute_id] + attribute_name = annotation_attribute["name"] + attribute_val = annotations[i]["attributes"][0]["value"] + + final_data.append({"path" : os.path.basename(annotation_audio_chunk_file_paths[i]), "sentence" : annotations[i]["transcript"], "age" : annotations[i]["age"], "gender" : annotations[i]["gender"], "accents" : annotations[i]["accent"], "locale" : annotations[i]["locale"], "emotion" : annotations[i]["emotion"], "label" : labels_mapping[annotations[i]["label_id"]]["name"], "attribute_name" : attribute_name, "attribute_value" : attribute_val, "start" : annotations[i]["points"][0], "end" : annotations[i]["points"][3]}) + + slogger.glob.debug("JOB ANNOTATION DATA") + slogger.glob.debug(json.dumps(final_data)) + slogger.glob.debug("All ANNOTATIONs DATA") + slogger.glob.debug(json.dumps(annotations)) return final_data, annotation_audio_chunk_file_paths def convert_annotation_data_format(data, format_name): if format_name == "Common Voice": return data elif format_name == "Librispeech": - data = list(map(lambda x: {"chapter_id" : "", "file" : x["path"], "id" : str(uuid.uuid4()), "speaker_id" : "", "text" : x["sentence"]}, data)) + data = list(map(lambda x: {"chapter_id" : "", "file" : x["path"], "id" : 
str(uuid.uuid4()), "speaker_id" : "", "text" : x["sentence"], "label" : x["label"], "attribute_name" : x["attribute_name"], "attribute_value" : x["attribute_value"], "start" : x["start"], "end" : x["end"]}, data)) elif format_name == "VoxPopuli": language_id_mapping = {"en" : 0} - data = list(map(lambda x: {"audio_id" : str(uuid.uuid4()), "language" : language_id_mapping[x["locale"]] if language_id_mapping.get(x["locale"]) else None, "audio_path" : x["path"], "raw_text" : x["sentence"], "normalized_text" : x["sentence"], "gender" : x["gender"], "speaker_id" : "", "is_gold_transcript" : False, "accent" : x["accent"]}, data)) + data = list(map(lambda x: {"audio_id" : str(uuid.uuid4()), "language" : language_id_mapping[x["locale"]] if language_id_mapping.get(x["locale"]) else None, "audio_path" : x["path"], "raw_text" : x["sentence"], "normalized_text" : x["sentence"], "gender" : x["gender"], "speaker_id" : "", "is_gold_transcript" : False, "accent" : x["accents"], "label" : x["label"], "attribute_name" : x["attribute_name"], "attribute_value" : x["attribute_value"], "start" : x["start"], "end" : x["end"]}, data)) elif format_name == "Ted-Lium": - data = list(map(lambda x: {"file" : x["path"], "text" : x["sentence"], "gender" : x["gender"], "id" : str(uuid.uuid4()), "speaker_id" : ""}, data)) + data = list(map(lambda x: {"file" : x["path"], "text" : x["sentence"], "gender" : x["gender"], "id" : str(uuid.uuid4()), "speaker_id" : "", "label" : x["label"], "attribute_name" : x["attribute_name"], "attribute_value" : x["attribute_value"], "start" : x["start"], "end" : x["end"]}, data)) return data def export_audino_job(job_id, dst_file, format_name, server_url=None, save_images=False): @@ -1074,6 +1111,9 @@ def export_audino_task(task_id, dst_file, format_name, server_url=None, save_ima final_data, annotation_audio_chunk_file_paths = get_audio_job_export_data(job.db_job.id, dst_file, job, temp_dir_base, temp_dir) + # Convert the data into a format + final_data = convert_annotation_data_format(final_data, format_name) + final_task_data.append(final_data) final_annotation_chunk_paths.append(annotation_audio_chunk_file_paths) diff --git a/cvat/apps/engine/backup.py b/cvat/apps/engine/backup.py index c30365ef6b51..962176b66194 100644 --- a/cvat/apps/engine/backup.py +++ b/cvat/apps/engine/backup.py @@ -179,6 +179,7 @@ class _TaskBackupBase(_BackupBase): def _prepare_task_meta(self, task): allowed_fields = { 'name', + 'segment_duration', 'bug_tracker', 'status', 'subset', @@ -232,6 +233,12 @@ def _prepare_annotations(self, annotations, label_mapping): 'attributes', 'shapes', 'elements', + 'gender', + 'age', + 'accent', + 'transcript', + 'locale', + 'emotion' } def _update_attribute(attribute, label): @@ -328,6 +335,7 @@ def __init__(self, pk, version=Version.V1): self._db_task = models.Task.objects.prefetch_related('data__images', 'annotation_guide__assets').select_related('data__video', 'annotation_guide').get(pk=pk) self._db_data = self._db_task.data self._version = version + self.logger = slogger.task[pk] db_labels = (self._db_task.project if self._db_task.project_id else self._db_task).label_set.all().prefetch_related( 'attributespec_set') @@ -382,6 +390,8 @@ def _write_task(self, zip_object, target_dir=None): def _write_manifest(self, zip_object, target_dir=None): def serialize_task(): task_serializer = TaskReadSerializer(self._db_task) + # self.logger.info("WRITE MANIFEST") + # self.logger.info(task_serializer.data) for field in ('url', 'owner', 'assignee'): task_serializer.fields.pop(field) @@ 
-641,6 +651,8 @@ def _write_data(zip_object): jobs = self._manifest.pop('jobs') self._prepare_task_meta(self._manifest) + self._logger.info("DEBUG IMPORT") + self._logger.info(self._manifest) self._manifest['owner_id'] = self._user_id self._manifest['project_id'] = self._project_id diff --git a/cvat/apps/engine/media_extractors.py b/cvat/apps/engine/media_extractors.py index 66c1a396ce20..9114e1033fd6 100644 --- a/cvat/apps/engine/media_extractors.py +++ b/cvat/apps/engine/media_extractors.py @@ -10,11 +10,12 @@ import itertools import struct from enum import IntEnum +import chardet from abc import ABC, abstractmethod from contextlib import closing from typing import Iterable -from cvat.apps.engine.log import ServerLogManager -slogger = ServerLogManager(__name__) +# from cvat.apps.engine.log import ServerLogManager +# slogger = ServerLogManager(__name__) import av import numpy as np @@ -505,6 +506,26 @@ def _has_frame(self, i): return False + def get_total_frames(self): + total_frame = 0 + with self._get_av_container() as container: + stream = container.streams.audio[0] + stream.thread_type = 'AUTO' + for packet in container.demux(stream): + for image in packet.decode(): + total_frame += 1 + + return total_frame + + def get_file_encoding(self, file_path): + + with open(file_path, 'rb') as f: + rawdata = f.read(1024) + result = chardet.detect(rawdata) + encoding = result['encoding'] + + return encoding + def __iter__(self): with self._get_av_container() as container: stream = container.streams.audio[0] @@ -523,7 +544,12 @@ def get_progress(self, pos): def _get_av_container(self): if isinstance(self._source_path[0], io.BytesIO): self._source_path[0].seek(0) # required for re-reading - return av.open(self._source_path[0]) + + encoding = self.get_file_encoding(self._source_path[0]) + if encoding: + return av.open(self._source_path[0], metadata_encoding = encoding) + else: + return av.open(self._source_path[0]) def _get_duration(self): with self._get_av_container() as container: @@ -543,25 +569,24 @@ def _get_duration(self): def get_preview(self, frame): with self._get_av_container() as container: - stream = container.streams.video[0] + stream = container.streams.audio[0] tb_denominator = stream.time_base.denominator needed_time = int((frame / stream.guessed_rate) * tb_denominator) container.seek(offset=needed_time, stream=stream) for packet in container.demux(stream): for frame in packet.decode(): return self._get_preview(frame.to_image() if not stream.metadata.get('rotate') \ - else av.VideoFrame().from_ndarray( + else av.AudioFrame().from_ndarray( rotate_image( frame.to_ndarray(format='bgr24'), - 360 - int(container.streams.video[0].metadata.get('rotate')) + 360 - int(container.streams.audio[0].metadata.get('rotate')) ), format ='bgr24' ).to_image() ) def get_image_size(self, i): - image = (next(iter(self)))[0] - return image.width, image.height + return 1, 1 class FragmentMediaReader: def __init__(self, chunk_number, chunk_size, start, stop, step=1): @@ -953,44 +978,6 @@ def save_as_chunk(self, images, chunk_path): self._encode_images(images, output_container, output_v_stream) return [(input_w, input_h)] -class AudioCompressedChunkWriter(AudioChunkWriter): - def __init__(self, quality): - super().__init__(quality) - if self._codec_name == 'libx264': - self._codec_opts = { - 'profile': 'baseline', - 'coder': '0', - 'crf': str(self._image_quality), - 'wpredp': '0', - 'flags': '-loop', - } - - def save_as_chunk(self, images, chunk_path): - if not images: - raise Exception('no images to save') 
- - input_w = images[0][0].width - input_h = images[0][0].height - - downscale_factor = 1 - while input_h / downscale_factor >= 1080: - downscale_factor *= 2 - - output_h = input_h // downscale_factor - output_w = input_w // downscale_factor - - with av.open(chunk_path, 'w', format=self.FORMAT) as output_container: - output_v_stream = self._add_video_stream( - container=output_container, - w=output_w, - h=output_h, - rate=self._output_fps, - options=self._codec_opts, - ) - - self._encode_images(images, output_container, output_v_stream) - return [(input_w, input_h)] - def _is_archive(path): mime = mimetypes.guess_type(path) mime_type = mime[0] @@ -1043,18 +1030,18 @@ def _is_zip(path): 'mode': 'annotation', 'unique': False, }, - 'video': { - 'has_mime_type': _is_video, - 'extractor': VideoReader, - 'mode': 'interpolation', - 'unique': True, - }, 'audio': { 'has_mime_type': _is_audio, 'extractor': AudioReader, 'mode': 'interpolation', 'unique': False, }, + 'video': { + 'has_mime_type': _is_video, + 'extractor': VideoReader, + 'mode': 'interpolation', + 'unique': True, + }, 'archive': { 'has_mime_type': _is_archive, 'extractor': ArchiveReader, diff --git a/cvat/apps/engine/migrations/0084_job_ai_audio_annotation_error_msg_and_more.py b/cvat/apps/engine/migrations/0084_job_ai_audio_annotation_error_msg_and_more.py index d5683154d31d..a81f72ddd46a 100644 --- a/cvat/apps/engine/migrations/0084_job_ai_audio_annotation_error_msg_and_more.py +++ b/cvat/apps/engine/migrations/0084_job_ai_audio_annotation_error_msg_and_more.py @@ -1,4 +1,4 @@ -# Generated by Django 4.2.6 on 2024-04-15 05:10 +# Generated by Django 4.2.6 on 2024-04-19 09:27 import cvat.apps.engine.models from django.db import migrations, models diff --git a/cvat/apps/engine/migrations/0085_alter_task_total_audio_duration.py b/cvat/apps/engine/migrations/0085_alter_task_total_audio_duration.py new file mode 100644 index 000000000000..c99d69f9ea54 --- /dev/null +++ b/cvat/apps/engine/migrations/0085_alter_task_total_audio_duration.py @@ -0,0 +1,12 @@ +# Generated by Django 4.2.6 on 2024-04-19 09:30 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("engine", "0084_job_ai_audio_annotation_error_msg_and_more"), + ] + + operations = [ + ] diff --git a/cvat/apps/engine/migrations/0086_task_segment_duration.py b/cvat/apps/engine/migrations/0086_task_segment_duration.py new file mode 100644 index 000000000000..5568c3cad7bb --- /dev/null +++ b/cvat/apps/engine/migrations/0086_task_segment_duration.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.6 on 2024-04-19 13:32 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("engine", "0085_alter_task_total_audio_duration"), + ] + + operations = [ + migrations.AddField( + model_name="task", + name="segment_duration", + field=models.PositiveIntegerField(default=None, null=True), + ), + ] diff --git a/cvat/apps/engine/migrations/0087_remove_task_total_audio_duration_and_more.py b/cvat/apps/engine/migrations/0087_remove_task_total_audio_duration_and_more.py new file mode 100644 index 000000000000..cf68450032fe --- /dev/null +++ b/cvat/apps/engine/migrations/0087_remove_task_total_audio_duration_and_more.py @@ -0,0 +1,17 @@ +# Generated by Django 4.2.6 on 2024-05-06 13:22 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("engine", "0086_task_segment_duration"), + ] + + operations = [ + migrations.AddField( + model_name="task", + 
name="audio_total_duration", + field=models.PositiveIntegerField(default=None, null=True), + ), + ] diff --git a/cvat/apps/engine/migrations/__init__.py b/cvat/apps/engine/migrations/__init__.py index 27d2190951b4..1a8ef647b935 100644 --- a/cvat/apps/engine/migrations/__init__.py +++ b/cvat/apps/engine/migrations/__init__.py @@ -1,5 +1,3 @@ - -# Copyright (C) 2018-2022 Intel Corporation +# Copyright (C) 2024 CVAT.ai Corporation # -# SPDX-License-Identifier: MIT - +# SPDX-License-Identifier: MIT \ No newline at end of file diff --git a/cvat/apps/engine/mixins.py b/cvat/apps/engine/mixins.py index 81ee63ca5425..2c500d7c55b9 100644 --- a/cvat/apps/engine/mixins.py +++ b/cvat/apps/engine/mixins.py @@ -188,7 +188,7 @@ class UploadMixin: 'Tus-Max-Size': _tus_max_file_size, 'Access-Control-Allow-Origin': "*", 'Access-Control-Allow-Methods': "PATCH,HEAD,GET,POST,OPTIONS", - 'Access-Control-Expose-Headers': "Tus-Resumable,upload-length,upload-metadata,Location,Upload-Offset", + 'Access-Control-Expose-Headers': "Tus-Resumable,upload-length,upload-metadata,Location,Upload-Offset,Upload-Filename", 'Access-Control-Allow-Headers': "Tus-Resumable,upload-length,upload-metadata,Location,Upload-Offset,content-type", 'Cache-Control': 'no-store' } @@ -284,8 +284,8 @@ def init_tus_upload(self, request): tus_file = TusFile.create_file(metadata, file_size, self.get_upload_dir()) location = request.build_absolute_uri() - if 'HTTP_X_FORWARDED_HOST' not in request.META: - location = request.META.get('HTTP_ORIGIN') + request.META.get('PATH_INFO') + # if 'HTTP_X_FORWARDED_HOST' not in request.META: + # location = request.META.get('HTTP_ORIGIN') + request.META.get('PATH_INFO') if import_type in ('backup', 'annotations', 'datasets'): scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.CLEANING.value) diff --git a/cvat/apps/engine/models.py b/cvat/apps/engine/models.py index 9efbea83daec..edd497b77b68 100644 --- a/cvat/apps/engine/models.py +++ b/cvat/apps/engine/models.py @@ -405,6 +405,7 @@ class Task(models.Model): project = models.ForeignKey(Project, on_delete=models.CASCADE, null=True, blank=True, related_name="tasks", related_query_name="task") + audio_total_duration = models.PositiveIntegerField(null=True, default=None) name = SafeCharField(max_length=256) mode = models.CharField(max_length=32) owner = models.ForeignKey(User, null=True, blank=True, @@ -429,6 +430,7 @@ class Task(models.Model): blank=True, on_delete=models.SET_NULL, related_name='+') target_storage = models.ForeignKey('Storage', null=True, default=None, blank=True, on_delete=models.SET_NULL, related_name='+') + segment_duration = models.PositiveIntegerField(null=True, default=None) # Extend default permission model class Meta: diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py index 2964f1e7fae1..cbae86108983 100644 --- a/cvat/apps/engine/serializers.py +++ b/cvat/apps/engine/serializers.py @@ -635,6 +635,7 @@ def create(self, validated_data): size = task.data.size valid_frame_ids = task.data.get_valid_frame_indices() + segment_size = task.segment_size frame_selection_method = validated_data.pop("frame_selection_method", None) if frame_selection_method == models.JobFrameSelectionMethod.RANDOM_UNIFORM: @@ -645,15 +646,36 @@ def create(self, validated_data): f"the number of the task frames ({size})" ) - seed = validated_data.pop("seed", None) + num_segments = size // segment_size + jobs_frame_list = [] + for i in range(num_segments): + start = i * segment_size + end = (i+1) * segment_size - 1 + array = [j for j in 
range(start,end+1)] + jobs_frame_list.append(array) + + # if there's a remainder, create the last array + if size % segment_size != 0: + start = num_segments * segment_size + end = size - 1 + array = [j for j in range(start,end+1)] + jobs_frame_list.append(array) + + #Random select from the list + import math, random + random_jobs_no = math.ceil(frame_count / segment_size) + selected_jobs_frames = random.sample(jobs_frame_list, random_jobs_no) + frames = sorted([item for sublist in selected_jobs_frames for item in sublist]) + + # seed = validated_data.pop("seed", None) # The RNG backend must not change to yield reproducible results, # so here we specify it explicitly - from numpy import random - rng = random.Generator(random.MT19937(seed=seed)) - frames = rng.choice( - list(valid_frame_ids), size=frame_count, shuffle=False, replace=False - ).tolist() + # from numpy import random + # rng = random.Generator(random.MT19937(seed=seed)) + # frames = rng.choice( + # list(valid_frame_ids), size=frame_count, shuffle=False, replace=False + # ).tolist() elif frame_selection_method == models.JobFrameSelectionMethod.MANUAL: frames = validated_data.pop("frames") @@ -1049,6 +1071,7 @@ class TaskReadSerializer(serializers.ModelSerializer): dimension = serializers.CharField(allow_blank=True, required=False) target_storage = StorageSerializer(required=False, allow_null=True) source_storage = StorageSerializer(required=False, allow_null=True) + segment_duration = serializers.IntegerField(allow_null=True) jobs = JobsSummarySerializer(url_filter_key='task_id', source='segment_set') labels = LabelsSummarySerializer(source='*') @@ -1058,7 +1081,7 @@ class Meta: 'bug_tracker', 'created_date', 'updated_date', 'overlap', 'segment_size', 'status', 'data_chunk_size', 'data_compressed_chunk_type', 'guide_id', 'data_original_chunk_type', 'size', 'image_quality', 'data', 'dimension', - 'subset', 'organization', 'target_storage', 'source_storage', 'jobs', 'labels', + 'subset', 'organization', 'target_storage', 'source_storage', 'segment_duration', 'jobs', 'labels', ) read_only_fields = fields extra_kwargs = { @@ -1074,12 +1097,13 @@ class TaskWriteSerializer(WriteOnceMixin, serializers.ModelSerializer): project_id = serializers.IntegerField(required=False, allow_null=True) target_storage = StorageSerializer(required=False, allow_null=True) source_storage = StorageSerializer(required=False, allow_null=True) + segment_duration = serializers.IntegerField(required=False, allow_null=True) class Meta: model = models.Task fields = ('url', 'id', 'name', 'project_id', 'owner_id', 'assignee_id', 'bug_tracker', 'overlap', 'segment_size', 'labels', 'subset', - 'target_storage', 'source_storage', + 'target_storage', 'source_storage', 'segment_duration' ) write_once_fields = ('overlap', 'segment_size') diff --git a/cvat/apps/engine/task.py b/cvat/apps/engine/task.py index 68dd8834cead..5d7704b873fc 100644 --- a/cvat/apps/engine/task.py +++ b/cvat/apps/engine/task.py @@ -2,7 +2,8 @@ # Copyright (C) 2022-2023 CVAT.ai Corporation # # SPDX-License-Identifier: MIT - +import av +import math import itertools import fnmatch import os @@ -27,7 +28,7 @@ from cvat.apps.engine import models from cvat.apps.engine.log import ServerLogManager -from cvat.apps.engine.media_extractors import (MEDIA_TYPES, ImageListReader, Mpeg4ChunkWriter, Mpeg4CompressedChunkWriter, AudioChunkWriter, AudioCompressedChunkWriter, +from cvat.apps.engine.media_extractors import (MEDIA_TYPES, ImageListReader, Mpeg4ChunkWriter, Mpeg4CompressedChunkWriter, AudioChunkWriter, 
ValidateDimension, ZipChunkWriter, ZipCompressedChunkWriter, get_mime, sort)
 from cvat.apps.engine.utils import av_scan_paths,get_rq_job_meta, define_dependent_job, get_rq_lock_by_user, preload_images
 from cvat.utils.http import make_requests_session, PROXIES_FOR_UNTRUSTED_URLS
@@ -111,8 +112,43 @@ def _get_task_segment_data(
     *,
     data_size: Optional[int] = None,
     job_file_mapping: Optional[JobFileMapping] = None,
+    segment_duration: Optional[int] = None
 ) -> SegmentsParams:
-    if job_file_mapping is not None:
+
+    # segment_duration = 3000
+
+    slogger.glob.debug("Segment Duration")
+    slogger.glob.debug(segment_duration)
+    if segment_duration is not None:
+        # Total audio duration in milliseconds
+        audio_total_duration = db_task.audio_total_duration
+
+        if audio_total_duration == 0:
+            return SegmentsParams(iter([]), 0, 0)
+
+        num_segments = max(1, math.ceil(audio_total_duration / segment_duration))
+
+        slogger.glob.debug("Num segments")
+        slogger.glob.debug(num_segments)
+
+        slogger.glob.debug("Num frames")
+        slogger.glob.debug(db_task.data.size)
+
+        def _segments():
+            start_time = 0
+            for _ in range(num_segments):
+                stop_time = start_time + segment_duration - 1
+                yield SegmentParams(start_time, stop_time)
+                start_time = stop_time + 1
+
+        segments = _segments()
+        slogger.glob.debug("Segment count")  # _segments() is a generator, so len(segments) would raise TypeError
+        slogger.glob.debug(num_segments)
+        segment_size = 0
+        overlap = 0
+
+    elif job_file_mapping is not None:
         def _segments():
             # It is assumed here that files are already saved ordered in the task
             # Here we just need to create segments by the job sizes
@@ -134,6 +170,13 @@ def _segments():
         segment_size = db_task.segment_size
         segment_step = segment_size
+
+        if segment_size == 0:
+            raise ValueError("Segment size cannot be zero.")
+
+        slogger.glob.debug(data_size)
+        slogger.glob.debug(segment_size)
+        slogger.glob.debug(segment_step)
         if segment_size == 0 or segment_size > data_size:
             segment_size = data_size
@@ -141,7 +184,8 @@
             # Otherwise a task contains an extra segment
             segment_step = sys.maxsize

-        overlap = 5 if db_task.mode == 'interpolation' else 0
+        # overlap = 5 if db_task.mode == 'interpolation' else 0
+        overlap = 0
         if db_task.overlap is not None:
             overlap = min(db_task.overlap, segment_size // 2)
@@ -210,6 +254,9 @@ def _count_files(data):
     def count_files(file_mapping, counter):
         for rel_path, full_path in file_mapping.items():
             mime = get_mime(full_path)
+
+            slogger.glob.debug("Mimetype")
+            slogger.glob.debug(mime)
             if mime in counter:
                 counter[mime].append(rel_path)
             elif rel_path.endswith('.jsonl'):
@@ -651,6 +698,7 @@ def _create_thread(
     # count and validate uploaded files
     media = _count_files(data)
+
     media, task_mode = _validate_data(media, manifest_files)

     if job_file_mapping is not None and task_mode != 'annotation':
@@ -880,7 +928,7 @@ def update_progress(progress):
         if not hasattr(update_progress, 'call_counter'):
             update_progress.call_counter = 0

-        status_message = 'CVAT is preparing data chunks'
+        status_message = 'Audino is preparing data chunks'
         if not progress:
             status_message = '{} {}'.format(status_message, progress_animation[update_progress.call_counter])
         job.meta['status'] = status_message
@@ -909,19 +957,113 @@ def update_progress(progress):
     compressed_chunk_writer = compressed_chunk_writer_class(db_data.image_quality, **kwargs)
     original_chunk_writer = original_chunk_writer_class(original_quality, **kwargs)

-    # calculate chunk size if it isn't specified
-    if db_data.chunk_size is None:
-        if isinstance(compressed_chunk_writer, ZipCompressedChunkWriter):
-            if not
is_data_in_cloud: - w, h = extractor.get_image_size(0) - else: - img_properties = manifest[0] - w, h = img_properties['width'], img_properties['height'] - area = h * w - db_data.chunk_size = max(2, min(72, 36 * 1920 * 1080 // area)) + def get_file_encoding(file_path): + import chardet + + with open(file_path, 'rb') as f: + rawdata = f.read(1024) + result = chardet.detect(rawdata) + encoding = result['encoding'] + + return encoding + def get_audio_duration(file_path): + encoding=get_file_encoding(file_path) + slogger.glob.debug("ENCODING") + slogger.glob.debug(encoding) + # Open the audio file + if encoding: + container = av.open(file_path, metadata_encoding=encoding) else: - db_data.chunk_size = 36 + container = av.open(file_path) + + # Get the first audio stream + audio_stream = next((stream for stream in container.streams if stream.codec.type == 'audio'), None) + + if not audio_stream: + print("Error: No audio stream found in the file.") + return None + + # Get the duration in seconds based on stream information + duration_milliseconds = int(audio_stream.duration * audio_stream.time_base * 1000) + + slogger.glob.debug("FFF AUDIO DURATION") + slogger.glob.debug(audio_stream.duration) + + slogger.glob.debug("PPP AUDIO DURATION") + slogger.glob.debug(audio_stream.time_base) + + # Close the container + container.close() + + return duration_milliseconds + + + db_task.audio_total_duration = None + + if MEDIA_TYPE == "audio": + + slogger.glob.debug("Before segment_duration") + slogger.glob.debug(db_task.segment_duration) + + segment_duration = db_task.segment_duration if db_task.segment_duration is not None else 600000 + db_task.audio_total_duration = get_audio_duration(details['source_path'][0]) + # db_task.data.audio_total_duration = 720000 #get_audio_duration(details['source_path'][0]) + total_audio_frames = extractor.get_total_frames() + + slogger.glob.debug("TOTAL AUDIO DURATION") + slogger.glob.debug(db_task.audio_total_duration) + + num_frames_per_millisecond = total_audio_frames / db_task.audio_total_duration + + if segment_duration == 0: + segment_duration = db_task.audio_total_duration + # db_task.segment_size = 0 + # db_data.chunk_size = db_task.audio_total_duration*num_frames_per_millisecond + # else: + + slogger.glob.debug("num_frames_per_millisecond") + slogger.glob.debug(num_frames_per_millisecond) + + slogger.glob.debug("segment_duration") + slogger.glob.debug(segment_duration) + + num_frames_per_segment_duration = num_frames_per_millisecond*segment_duration + db_task.segment_size = int(round(num_frames_per_segment_duration)) + + num_segments = max(1, int(math.ceil(db_task.audio_total_duration / segment_duration))) + + slogger.glob.debug("Segment Size Before") + slogger.glob.debug(db_task.segment_size) + + slogger.glob.debug("Segment Size After") + slogger.glob.debug(db_task.segment_size) + + slogger.glob.debug("Num segments") + slogger.glob.debug(num_segments) + + slogger.glob.debug("Num frames") + slogger.glob.debug(total_audio_frames) + + slogger.glob.debug("Audio Duration") + slogger.glob.debug(db_task.audio_total_duration) + + # Default chunk size = entire frames + db_data.chunk_size = db_task.segment_size #db_task.data.size + else: + if db_data.chunk_size is None: + if isinstance(compressed_chunk_writer, ZipCompressedChunkWriter): + if not is_data_in_cloud: + w, h = extractor.get_image_size(0) + else: + img_properties = manifest[0] + w, h = img_properties['width'], img_properties['height'] + area = h * w + db_data.chunk_size = max(2, min(72, 36 * 1920 * 1080 // area)) + 
else: + db_data.chunk_size = 36 + slogger.glob.debug("OPPPPP CHUNK SIZE") + slogger.glob.debug(db_data.chunk_size) video_path = "" video_size = (0, 0) diff --git a/cvat/apps/engine/templates/audio_annotation/annotation_message.html b/cvat/apps/engine/templates/audio_annotation/annotation_message.html new file mode 100644 index 000000000000..172a7de8f14c --- /dev/null +++ b/cvat/apps/engine/templates/audio_annotation/annotation_message.html @@ -0,0 +1,176 @@ + +{% load account %}{% user_display user as user_display %}{% load i18n %}{% autoescape off %} +{% load static %} + + + + + + Annotation Job Completed + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + Logo + +
+
+ + + + +
+

Your annotation job is complete

+
+
+ + + + + + + + + + + + +
+ {% blocktrans %} +

Dear {{ username }},

+

Your audio annotation job (ID: {{ job_id }}) has been completed.

+

You can check the results at the following link:

+ {% endblocktrans %} +
+ + + + +
+ + + + +
+ {% blocktrans %} + + View Job Details + + {% endblocktrans %} +
+
+
+
+ {% blocktrans %} +

Thank you for using our service!

+ {% endblocktrans %} {% endautoescape %} +
+
+ + + + +
+

If you weren't expecting this notification, you can safely ignore this email.

+
+
+ + diff --git a/cvat/apps/engine/templates/audio_annotation/annotation_subject.txt b/cvat/apps/engine/templates/audio_annotation/annotation_subject.txt new file mode 100644 index 000000000000..749732ec110d --- /dev/null +++ b/cvat/apps/engine/templates/audio_annotation/annotation_subject.txt @@ -0,0 +1,4 @@ +{% load i18n %} +{% autoescape off %} +{% blocktrans %} Your Audio Annotation Job Has Been Successfully Completed!{% endblocktrans %} +{% endautoescape %} diff --git a/cvat/apps/engine/templates/audio_annotation/error_message.html b/cvat/apps/engine/templates/audio_annotation/error_message.html new file mode 100644 index 000000000000..386aae756754 --- /dev/null +++ b/cvat/apps/engine/templates/audio_annotation/error_message.html @@ -0,0 +1,177 @@ + +{% load account %}{% user_display user as user_display %}{% load i18n %}{% autoescape off %} +{% load static %} + + + + + + Annotation Job Error + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + Logo + +
+
+ + + + +
+

+ Error with your annotation job +

+
+
+ + + + + + + + + + + + +
+ {% blocktrans %} +

Dear {{ username }},

+

Unfortunately, there was an error while processing your audio annotation job (ID: {{ job_id }}).

+

Error details: {{ error }}

+

You can check the status at the following link:

+ {% endblocktrans %} +
+ + + + +
+ + + + +
+ {% blocktrans %} + + View Job Details + + {% endblocktrans %} +
+
+
+
+ {% blocktrans %} +

We apologize for the inconvenience and are working to resolve the issue.

+ {% endblocktrans %} {% endautoescape %} +
+
+ + + + +
+

If you weren't expecting this notification, you can safely ignore this email.

+
+
+ + diff --git a/cvat/apps/engine/templates/audio_annotation/error_subject.txt b/cvat/apps/engine/templates/audio_annotation/error_subject.txt new file mode 100644 index 000000000000..8681f4ddd72c --- /dev/null +++ b/cvat/apps/engine/templates/audio_annotation/error_subject.txt @@ -0,0 +1,4 @@ +{% load i18n %} +{% autoescape off %} +{% blocktrans %} Error with Your Audio Annotation Job!{% endblocktrans %} +{% endautoescape %} diff --git a/cvat/apps/engine/views.py b/cvat/apps/engine/views.py index e6d7cce1a5ed..bbf076734d1e 100644 --- a/cvat/apps/engine/views.py +++ b/cvat/apps/engine/views.py @@ -27,6 +27,10 @@ from django.db.models.query import Prefetch from django.http import HttpResponse, HttpResponseNotFound, HttpResponseBadRequest from django.utils import timezone +from django.core.mail import send_mail +from django.contrib.sites.shortcuts import get_current_site +from django.core.exceptions import ImproperlyConfigured +from allauth.account.adapter import get_adapter from drf_spectacular.types import OpenApiTypes from drf_spectacular.utils import ( @@ -65,7 +69,7 @@ LabeledDataSerializer, ProjectReadSerializer, ProjectWriteSerializer, RqStatusSerializer, TaskReadSerializer, TaskWriteSerializer, - UserSerializer, PluginsSerializer, IssueReadSerializer, AIAudioAnnotationSerializer, ExportAudioAnnotationSerializer, + UserSerializer, PluginsSerializer, IssueReadSerializer, AIAudioAnnotationSerializer, AnnotationGuideReadSerializer, AnnotationGuideWriteSerializer, AssetReadSerializer, AssetWriteSerializer, IssueWriteSerializer, CommentReadSerializer, CommentWriteSerializer, CloudStorageWriteSerializer, @@ -708,7 +712,7 @@ def _check_frame_range(self, frame: int): raise ValidationError("The frame number doesn't belong to the job") def __call__(self, request, start, stop, db_data): - if self.type == 'chunk' and self.job.segment.type == SegmentType.SPECIFIC_FRAMES: + if self.type == 'chunk' and self.job.segment.type == SegmentType.SPECIFIC_FRAMES and self.job.segment.task.data.compressed_chunk_type != 'audio': frame_provider = FrameProvider(db_data, self.dimension) start_chunk = frame_provider.get_chunk_number(start) @@ -1941,6 +1945,7 @@ def metadata(self, request, pk): data_start_frame = db_data.start_frame + start_frame * frame_step data_stop_frame = min(db_data.stop_frame, db_data.start_frame + stop_frame * frame_step) frame_set = db_job.segment.frame_set + segment_size = db_job.segment.task.segment_size if request.method == 'PATCH': serializer = DataMetaWriteSerializer(instance=db_data, data=request.data) @@ -1982,6 +1987,7 @@ def metadata(self, request, pk): db_data.stop_frame = data_stop_frame db_data.size = len(frame_set) db_data.included_frames = db_job.segment.frames or None + db_data.segment_size = segment_size frame_meta = [{ 'width': item.width, @@ -2021,6 +2027,30 @@ class AIAudioAnnotationViewSet(viewsets.ModelViewSet): filter_fields = [] filter_backends = [] + def send_annotation_email(self, request, template_name, err=None): + job_id = request.data.get('jobId') + if settings.EMAIL_BACKEND is None: + raise ImproperlyConfigured("Email backend is not configured") + + # Find the user associated with current request + user = self.request.user + + target_email = user.email + current_site = get_current_site(request) + site_name = current_site.name + domain = current_site.domain + context = { + 'username': user.username, + 'domain': domain, + 'site_name': site_name, + 'job_id': job_id, + 'protocol': 'https' if request.is_secure() else 'http' + } + if err: + context['error'] = err + 
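# Editorial note, not part of the patch: allauth's adapter send_mail()
# (called just below) renders "<template_prefix>_subject.txt" for the subject
# and "<template_prefix>_message.html" (or "_message.txt") for the body, which
# is why this diff adds audio_annotation/annotation_subject.txt,
# annotation_message.html, error_subject.txt and error_message.html.
# A rough plain-Django equivalent, assuming those same template paths:
#
#     from django.core.mail import send_mail
#     from django.template.loader import render_to_string
#
#     subject = render_to_string(
#         f"audio_annotation/{template_name}_subject.txt", context).strip()
#     html_body = render_to_string(
#         f"audio_annotation/{template_name}_message.html", context)
#     send_mail(subject, html_body, None, [target_email], html_message=html_body)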
+ get_adapter(request).send_mail(f'audio_annotation/{template_name}', target_email, context) + @action(detail=False, methods=['post'], url_path='save') def save_segments(self, request): try: @@ -2028,8 +2058,8 @@ def save_segments(self, request): # Find labels of a particular job job = Job.objects.get(id=job_id) - labels_queryset = job.get_labels() - labels_list = list(labels_queryset.values()) + # labels_queryset = job.get_labels() + # labels_list = list(labels_queryset.values()) segments = request.data.get('segments') @@ -2060,15 +2090,18 @@ def save_segments(self, request): job.save() + self.send_annotation_email(request, 'annotation') return Response({'success': True, 'segments': saved_segments}, status=status.HTTP_201_CREATED) except Exception as e: + self.send_annotation_email(request, 'error', err=str(e)) return Response({'error': str(e)}, status=status.HTTP_500_INTERNAL_SERVER_ERROR) @action(detail=False, methods=['post'], url_path='ai-annotate') def request_ai_annotation(self, request): try: job_id = request.data.get('jobId') + lang = request.data.get('lang') authHeader = request.headers.get('Authorization') # Find labels of a particular job @@ -2086,7 +2119,10 @@ def request_ai_annotation(self, request): job.save() # Iterate over segments and save to the model - r = requests.post("http://35.208.178.37:8000/transcript", json={ "jobId" : job_id, "authToken" : authHeader, "background_task_id" : background_task_id}) + ai_annotation_host = os.getenv('AI_ANNOTATION_HOST', '35.208.178.37') + ai_annotation_port = int(os.getenv('AI_ANNOTATION_PORT', "8000")) + url = f"http://{ai_annotation_host}:{ai_annotation_port}/transcript" + r = requests.post(url, json={ "jobId" : job_id, "lang" : lang, "authToken" : authHeader, "background_task_id" : background_task_id}) return Response({'success': True}, status=status.HTTP_200_OK) diff --git a/cvat/apps/iam/templates/account/email/email_confirmation_message.html b/cvat/apps/iam/templates/account/email/email_confirmation_message.html index 4ef34dd474f8..3227fc1054b2 100644 --- a/cvat/apps/iam/templates/account/email/email_confirmation_message.html +++ b/cvat/apps/iam/templates/account/email/email_confirmation_message.html @@ -121,9 +121,9 @@
- + Logo {% blocktrans with site_name=current_site.name site_domain=current_site.domain %}

- Thank you for signing up for CVAT!
+ Thank you for signing up for Audino!

To complete registration and start annotating, simply tap the button below and confirm your email address. diff --git a/cvat/apps/organizations/templates/invitation/invitation_message.html b/cvat/apps/organizations/templates/invitation/invitation_message.html index b511462958d4..4b1f05f7653f 100644 --- a/cvat/apps/organizations/templates/invitation/invitation_message.html +++ b/cvat/apps/organizations/templates/invitation/invitation_message.html @@ -122,9 +122,9 @@
- + Logo {% blocktrans %}

- You're receiving this email because you've been invited to join {{ organization_name }} organization in CVAT by {{ invitation_owner }} at {{ site_name }}.
+ You're receiving this email because you've been invited to join the {{ organization_name }} organization in Audino by {{ invitation_owner }} at {{ site_name }}.

To join organization and start annotating, simply tap the button below and complete registration. diff --git a/cvat/apps/organizations/templates/invitation/invitation_subject.txt b/cvat/apps/organizations/templates/invitation/invitation_subject.txt index 4fedaaf7bed2..53ad2eddd1e2 100644 --- a/cvat/apps/organizations/templates/invitation/invitation_subject.txt +++ b/cvat/apps/organizations/templates/invitation/invitation_subject.txt @@ -1,4 +1,4 @@ {% load i18n %} {% autoescape off %} -{% blocktrans %}You're invited to join {{ organization_name }} organization in CVAT!{% endblocktrans %} +{% blocktrans %}You're invited to join {{ organization_name }} organization in Audino!{% endblocktrans %} {% endautoescape %} diff --git a/cvat/apps/quality_control/migrations/0002_annotationconflict_character_error_rate_and_more.py b/cvat/apps/quality_control/migrations/0002_annotationconflict_character_error_rate_and_more.py new file mode 100644 index 000000000000..2347c7e75475 --- /dev/null +++ b/cvat/apps/quality_control/migrations/0002_annotationconflict_character_error_rate_and_more.py @@ -0,0 +1,41 @@ +# Generated by Django 4.2.6 on 2024-06-15 11:52 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("quality_control", "0001_initial"), + ] + + operations = [ + migrations.AddField( + model_name="annotationconflict", + name="character_error_rate", + field=models.IntegerField(default=0, null=True), + ), + migrations.AddField( + model_name="annotationconflict", + name="word_error_rate", + field=models.IntegerField(default=0, null=True), + ), + migrations.AlterField( + model_name="annotationconflict", + name="type", + field=models.CharField( + choices=[ + ("missing_annotation", "MISSING_ANNOTATION"), + ("extra_annotation", "EXTRA_ANNOTATION"), + ("mismatching_label", "MISMATCHING_LABEL"), + ("low_overlap", "LOW_OVERLAP"), + ("mismatching_direction", "MISMATCHING_DIRECTION"), + ("mismatching_attributes", "MISMATCHING_ATTRIBUTES"), + ("mismatching_groups", "MISMATCHING_GROUPS"), + ("covered_annotation", "COVERED_ANNOTATION"), + ("mismatching_extra_parameters", "MISMATCHING_EXTRA_PARAMETERS"), + ("mismatching_transcript", "MISMATCHING_TRANSCRIPT"), + ], + max_length=32, + ), + ), + ] diff --git a/cvat/apps/quality_control/models.py b/cvat/apps/quality_control/models.py index e4e39f5fb922..16fe703ceaf4 100644 --- a/cvat/apps/quality_control/models.py +++ b/cvat/apps/quality_control/models.py @@ -24,6 +24,8 @@ class AnnotationConflictType(str, Enum): MISMATCHING_ATTRIBUTES = "mismatching_attributes" MISMATCHING_GROUPS = "mismatching_groups" COVERED_ANNOTATION = "covered_annotation" + MISMATCHING_EXTRA_PARAMETERS = "mismatching_extra_parameters" + MISMATCHING_TRANSCRIPT = "mismatching_transcript" def __str__(self) -> str: return self.value @@ -134,6 +136,8 @@ class AnnotationConflict(models.Model): frame = models.PositiveIntegerField() type = models.CharField(max_length=32, choices=AnnotationConflictType.choices()) severity = models.CharField(max_length=32, choices=AnnotationConflictSeverity.choices()) + word_error_rate = models.IntegerField(default=0, null=True) + character_error_rate = models.IntegerField(default=0, null=True) annotation_ids: Sequence[AnnotationId] diff --git a/cvat/apps/quality_control/quality_reports.py b/cvat/apps/quality_control/quality_reports.py index 1f3ff5682569..11e73165ba4e 100644 --- a/cvat/apps/quality_control/quality_reports.py +++ b/cvat/apps/quality_control/quality_reports.py @@ -107,6 +107,8 @@ class 
AnnotationConflict(_Serializable): frame_id: int type: AnnotationConflictType annotation_ids: List[AnnotationId] + word_error_rate: Optional[float] = None + character_error_rate: Optional[float] = None @property def severity(self) -> AnnotationConflictSeverity: @@ -114,6 +116,7 @@ def severity(self) -> AnnotationConflictSeverity: AnnotationConflictType.MISSING_ANNOTATION, AnnotationConflictType.EXTRA_ANNOTATION, AnnotationConflictType.MISMATCHING_LABEL, + AnnotationConflictType.MISMATCHING_TRANSCRIPT, ]: severity = AnnotationConflictSeverity.ERROR elif self.type in [ @@ -122,6 +125,7 @@ def severity(self) -> AnnotationConflictSeverity: AnnotationConflictType.MISMATCHING_DIRECTION, AnnotationConflictType.MISMATCHING_GROUPS, AnnotationConflictType.COVERED_ANNOTATION, + AnnotationConflictType.MISMATCHING_EXTRA_PARAMETERS, ]: severity = AnnotationConflictSeverity.WARNING else: @@ -144,6 +148,8 @@ def from_dict(cls, d: dict): frame_id=d["frame_id"], type=AnnotationConflictType(d["type"]), annotation_ids=list(AnnotationId.from_dict(v) for v in d["annotation_ids"]), + word_error_rate=d["word_error_rate"], + character_error_rate =d["character_error_rate"], ) @@ -161,6 +167,15 @@ class ComparisonParameters(_Serializable): compare_attributes: bool = True "Enables or disables attribute checks" + compare_extra_parameters: bool = True + "Enables or disables extra parameters checks for audio data" + + wer_threshold: float = 0.2 + "Used for distinction between matched and unmatched transcript at word level" + + cer_threshold: float = 0.2 + "Used for distinction between matched and unmatched transcript at character level" + ignored_attributes: List[str] = [] iou_threshold: float = 0.4 @@ -2077,6 +2092,572 @@ def generate_report(self) -> ComparisonReport: ) +class AudioDatasetComparator: + DEFAULT_SETTINGS = ComparisonParameters() + + def __init__( + self, + ds_data_provider: JobDataProvider, + gt_data_provider: JobDataProvider, + offset, + job_duration, + *, + settings: Optional[ComparisonParameters] = None, + ) -> None: + if settings is None: + settings = self.DEFAULT_SETTINGS + self.settings = settings + + self._ds_data_provider = ds_data_provider + self._gt_data_provider = gt_data_provider + self._offset = offset + self._job_duration = job_duration + self._job_id = self._ds_data_provider.job_id + + self._frame_results: Dict[int, ComparisonReportFrameSummary] = {} + self.included_frames = gt_data_provider.job_data._db_job.segment.frame_set + + self.iou_threshold = settings.iou_threshold + self.wer_threshold = settings.wer_threshold + self.cer_threshold = settings.cer_threshold + + self.ignored_attrs = set(settings.ignored_attributes) | { + "track_id", # changes from task to task, can't be defined manually with the same name + "keyframe", # indicates the way annotation obtained, meaningless to compare + "z_order", # changes from frame to frame, compared by other means + "group", # changes from job to job, compared by other means + "rotation", # handled by other means + "outside", # handled by other means + } + + def _dm_ann_to_ann_id(self, ann): + if ann in self._ds_data_provider.job_annotation.data['shapes']: + source_data_provider = self._ds_data_provider + elif ann in self._gt_data_provider.job_annotation.data['shapes']: + source_data_provider = self._gt_data_provider + else: + assert False + + source_ann_id = ann['id'] + ann_type = AnnotationType.SHAPE + shape_type = ann['type'] + + return AnnotationId( + obj_id=source_ann_id, type=ann_type, shape_type=shape_type, job_id=source_data_provider.job_id + 
) + + def _find_audio_gt_conflicts(self): + start = self._ds_data_provider.job_data.start + end = self._ds_data_provider.job_data.stop - 1 + gt_frame_list = self._gt_data_provider.job_data._db_job.segment.frames + + # Check if any frame in gt_data_frame_array is in ds_data_frame_array + if not (start in gt_frame_list or end in gt_frame_list): + return # we need to compare only intersecting jobs + + ds_annotations = self._ds_data_provider.job_annotation.data['shapes'] + gt_annotations = self._gt_data_provider.job_annotation.data['shapes'] + + self._process_job(ds_annotations, gt_annotations) + + def _process_job(self, ds_annotations, gt_annotations): + job_id = self._job_id + job_results = self.match_annotations(ds_annotations, gt_annotations) + self._frame_results.setdefault(job_id, {}) + + self._generate_job_annotation_conflicts( + job_results, gt_annotations, ds_annotations + ) + + def match_annotations(self, ds_annotations, gt_annotations): + """ + Match annotations between two datasets. + This method should compare annotations based on their start and end times. + """ + def _interval_iou(interval1, interval2): + start1, end1 = interval1 + start2, end2 = interval2 + + start2 += self._offset + end2 += self._offset + + intersection = max(0, min(end1, end2) - max(start1, start2)) + union = max(end1, end2) - min(start1, start2) + return intersection / union if union > 0 else 0 + + job_start_time = self._offset - 0.1 + job_end_time = job_start_time + self._job_duration + 0.1 + + # Filter gt_annotations to include only those within the job's time bounds + gt_annotations = [ + gt_ann for gt_ann in gt_annotations + if job_start_time <= gt_ann['points'][0] and gt_ann['points'][3] <= job_end_time + ] + + + matches = [] + mismatches = [] + gt_unmatched = gt_annotations.copy() + ds_unmatched = ds_annotations.copy() + pairwise_distances = {} + + for gt_ann in gt_annotations: + matched = False + best_mismatch_pair = None + best_mismatch_iou = 0 # Initial best IoU for mismatches + + for ds_ann in ds_annotations: + gt_interval = (gt_ann['points'][0], gt_ann['points'][3]) + ds_interval = (ds_ann['points'][0], ds_ann['points'][3]) + iou = _interval_iou(gt_interval, ds_interval) + + if gt_ann['label_id'] == ds_ann['label_id']: + if iou >= self.iou_threshold: + matches.append((gt_ann, ds_ann)) + pairwise_distances[(id(gt_ann), id(ds_ann))] = iou + if gt_ann in gt_unmatched: + gt_unmatched.remove(gt_ann) + if ds_ann in ds_unmatched: + ds_unmatched.remove(ds_ann) + matched = True + else: + # Update best mismatch if this is the highest IoU seen so far + if iou > best_mismatch_iou: + best_mismatch_iou = iou + best_mismatch_pair = (gt_ann, ds_ann) + + # If no match was found and there is a best mismatch pair + if not matched and best_mismatch_pair is not None and best_mismatch_iou >= self.iou_threshold: + mismatches.append(best_mismatch_pair) + pairwise_distances[(id(best_mismatch_pair[0]), id(best_mismatch_pair[1]))] = best_mismatch_iou + + return [matches, mismatches, gt_unmatched, ds_unmatched, pairwise_distances] + + def match_attrs(self, ann_a, ann_b): #ann_a -> gt, ann_b -> ds + a_attrs = ann_a['attributes'] + b_attrs = ann_b['attributes'] + + matches = [] + a_unmatched = a_attrs.copy() + b_unmatched = b_attrs.copy() + + for a_attr in a_attrs: + for b_attr in b_attrs: + if a_attr['spec_id'] == b_attr['spec_id'] and a_attr['value'] == b_attr['value']: + matches.append((a_attr, b_attr)) + if a_attr in a_unmatched: + a_unmatched.remove(a_attr) + if b_attr in b_unmatched: + b_unmatched.remove(b_attr) + break 
# Once matched, move to the next a_attr + + return matches, a_unmatched, b_unmatched + + def match_extra_parameters(self, gt_ann, ds_ann): + parameters = ['Gender', 'Locale', 'Accent', 'Emotion', 'Age'] + matches = [] + mismatches = [] + for param in parameters: + if gt_ann.get(param) == ds_ann.get(param): + matches.append(param) + else: + mismatches.append(param) + + return matches, mismatches + + + def calculate_wer(self, gt_transcript, ds_transcript): + """ + Calculate the Word Error Rate (WER) between a ground truth transcript and an annotated transcript. + """ + + gt_transcript = gt_transcript.lower() + ds_transcript = ds_transcript.lower() + + gt_words = gt_transcript.split() + ds_words = ds_transcript.split() + + if len(gt_words) == 0: + if len(ds_words) == 0: + return 0.0 # Both transcripts are empty + else: + return 1.0 # Ground truth transcript is empty but annotation transcript is not + + d = np.zeros((len(gt_words) + 1, len(ds_words) + 1), dtype=int) + + for i in range(len(gt_words) + 1): + d[i][0] = i + for j in range(len(ds_words) + 1): + d[0][j] = j + + for i in range(1, len(gt_words) + 1): + for j in range(1, len(ds_words) + 1): + if gt_words[i - 1] == ds_words[j - 1]: + d[i][j] = d[i - 1][j - 1] + else: + d[i][j] = min(d[i - 1][j] + 1, # deletion + d[i][j - 1] + 1, # insertion + d[i - 1][j - 1] + 1) # substitution + + wer = d[len(gt_words)][len(ds_words)] / float(len(gt_words)) + return wer + + def calculate_cer(self, gt_transcript, ds_transcript): + """ + Calculate the Character Error Rate (CER) between a ground truth transcript and an annotated transcript. + """ + + gt_transcript = gt_transcript.lower() + ds_transcript = ds_transcript.lower() + + gt_chars = list(gt_transcript) + ds_chars = list(ds_transcript) + + if len(gt_chars) == 0: + if len(ds_chars) == 0: + return 0.0 # Both transcripts are empty + else: + return 1.0 # Ground truth transcript is empty but annotation transcript is not + + d = np.zeros((len(gt_chars) + 1, len(ds_chars) + 1), dtype=int) + + for i in range(len(gt_chars) + 1): + d[i][0] = i + for j in range(len(ds_chars) + 1): + d[0][j] = j + + for i in range(1, len(gt_chars) + 1): + for j in range(1, len(ds_chars) + 1): + if gt_chars[i - 1] == ds_chars[j - 1]: + d[i][j] = d[i - 1][j - 1] + else: + d[i][j] = min(d[i - 1][j] + 1, # deletion + d[i][j - 1] + 1, # insertion + d[i - 1][j - 1] + 1) # substitution + + cer = d[len(gt_chars)][len(ds_chars)] / float(len(gt_chars)) + return cer + + + def _generate_job_annotation_conflicts( + self, job_results, gt_annotations, ds_annotations + ) -> List[AnnotationConflict]: + conflicts = [] + job_id = self._job_id + + matches, mismatches, gt_unmatched, ds_unmatched, pairwise_distances = job_results + + for unmatched_ann in gt_unmatched: + conflicts.append( + AnnotationConflict( + frame_id=job_id, + type=AnnotationConflictType.MISSING_ANNOTATION, + annotation_ids=[self._dm_ann_to_ann_id(unmatched_ann)], + ) + ) + + for unmatched_ann in ds_unmatched: + conflicts.append( + AnnotationConflict( + frame_id=job_id, + type=AnnotationConflictType.EXTRA_ANNOTATION, + annotation_ids=[self._dm_ann_to_ann_id(unmatched_ann)], + ) + ) + + for gt_ann, ds_ann in mismatches: + conflicts.append( + AnnotationConflict( + frame_id=job_id, + type=AnnotationConflictType.MISMATCHING_LABEL, + annotation_ids=[ + self._dm_ann_to_ann_id(gt_ann), + self._dm_ann_to_ann_id(ds_ann) + ], + ) + ) + + for gt_ann, ds_ann in matches: + gt_transcript = gt_ann['transcript'] + ds_transcript = ds_ann['transcript'] + wer = self.calculate_wer(gt_transcript, 
ds_transcript) + cer = self.calculate_cer(gt_transcript, ds_transcript) + if wer > self.wer_threshold or cer > self.cer_threshold: + conflicts.append( + AnnotationConflict( + frame_id=job_id, + type=AnnotationConflictType.MISMATCHING_TRANSCRIPT, + annotation_ids=[ + self._dm_ann_to_ann_id(gt_ann), + self._dm_ann_to_ann_id(ds_ann), + ], + word_error_rate=wer, + character_error_rate=cer, + ) + ) + + if self.settings.compare_attributes: + for gt_ann, ds_ann in matches: + attribute_results = self.match_attrs(gt_ann, ds_ann) + if any(attribute_results[1:]): + conflicts.append( + AnnotationConflict( + frame_id=job_id, + type=AnnotationConflictType.MISMATCHING_ATTRIBUTES, + annotation_ids=[ + self._dm_ann_to_ann_id(gt_ann), + self._dm_ann_to_ann_id(ds_ann), + ], + ) + ) + + if self.settings.compare_extra_parameters: + for gt_ann, ds_ann in matches: + extra_parameter_results = self.match_extra_parameters(gt_ann, ds_ann) + if any(extra_parameter_results[1:]): + conflicts.append( + AnnotationConflict( + frame_id=job_id, + type=AnnotationConflictType.MISMATCHING_EXTRA_PARAMETERS, + annotation_ids=[ + self._dm_ann_to_ann_id(gt_ann), + self._dm_ann_to_ann_id(ds_ann), + ], + ) + ) + + valid_shapes_count = len(matches) + len(mismatches) + missing_shapes_count = len(gt_unmatched) + extra_shapes_count = len(ds_unmatched) + total_shapes_count = len(matches) + len(mismatches) + len(gt_unmatched) + len(ds_unmatched) + ds_shapes_count = len(matches) + len(mismatches) + len(ds_unmatched) + gt_shapes_count = len(matches) + len(mismatches) + len(gt_unmatched) + + valid_labels_count = len(matches) + invalid_labels_count = len(mismatches) + total_labels_count = valid_labels_count + invalid_labels_count + + # Get labels from project returns a queryset) + labels_queryset = self._ds_data_provider.job_data._db_task.project.get_labels() + + # Convert queryset to a dictionary of labels + confusion_matrix_labels = { + label.id: label.name + for i, label in enumerate(labels_queryset) + if not label.parent + } + confusion_matrix_labels[None] = "unmatched" + confusion_matrix_labels_rmap = {k: i for i, k in enumerate(confusion_matrix_labels.keys())} + confusion_matrix_label_count = len(confusion_matrix_labels) + confusion_matrix = np.zeros( + (confusion_matrix_label_count, confusion_matrix_label_count), dtype=int + ) + for gt_ann, ds_ann in itertools.chain( + # fully matched annotations - shape, label, attributes + matches, + mismatches, + zip(itertools.repeat(None), ds_unmatched), + zip(gt_unmatched, itertools.repeat(None)), + ): + ds_label_idx = confusion_matrix_labels_rmap[ds_ann["label_id"] if ds_ann else None] + gt_label_idx = confusion_matrix_labels_rmap[gt_ann["label_id"] if gt_ann else None] + confusion_matrix[ds_label_idx, gt_label_idx] += 1 + + matched_ann_counts = np.diag(confusion_matrix) + ds_ann_counts = np.sum(confusion_matrix, axis=1) + gt_ann_counts = np.sum(confusion_matrix, axis=0) + label_accuracies = _arr_div( + matched_ann_counts, ds_ann_counts + gt_ann_counts - matched_ann_counts + ) + label_precisions = _arr_div(matched_ann_counts, ds_ann_counts) + label_recalls = _arr_div(matched_ann_counts, gt_ann_counts) + + valid_annotations_count = np.sum(matched_ann_counts) + missing_annotations_count = np.sum(confusion_matrix[confusion_matrix_labels_rmap[None], :]) + extra_annotations_count = np.sum(confusion_matrix[:, confusion_matrix_labels_rmap[None]]) + total_annotations_count = np.sum(confusion_matrix) + ds_annotations_count = ( + np.sum(ds_ann_counts) - ds_ann_counts[confusion_matrix_labels_rmap[None]] + ) 
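# Editorial worked example, not part of the patch, for the calculate_wer /
# calculate_cer helpers defined above: with gt = "the cat sat" and
# ds = "the cat sat down", the word-level edit distance is one insertion over
# three reference words (WER = 1/3), and " down" adds five characters over
# eleven reference characters (CER = 5/11). Both exceed the default
# wer_threshold / cer_threshold of 0.2, so such a pair would be reported as a
# MISMATCHING_TRANSCRIPT conflict with both rates attached.
assert abs(self.calculate_wer("the cat sat", "the cat sat down") - 1 / 3) < 1e-9
assert abs(self.calculate_cer("the cat sat", "the cat sat down") - 5 / 11) < 1e-9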
+
+    def generate_audio_report(self) -> ComparisonReport:
+        self._find_audio_gt_conflicts()
+
+        # accumulate stats
+        intersection_frames = []
+        conflicts = []
+        annotations = ComparisonReportAnnotationsSummary(
+            valid_count=0,
+            missing_count=0,
+            extra_count=0,
+            total_count=0,
+            ds_count=0,
+            gt_count=0,
+            confusion_matrix=None,
+        )
+        annotation_components = ComparisonReportAnnotationComponentsSummary(
+            shape=ComparisonReportAnnotationShapeSummary(
+                valid_count=0,
+                missing_count=0,
+                extra_count=0,
+                total_count=0,
+                ds_count=0,
+                gt_count=0,
+                mean_iou=0,
+            ),
+            label=ComparisonReportAnnotationLabelSummary(
+                valid_count=0,
+                invalid_count=0,
+                total_count=0,
+            ),
+        )
+        mean_ious = []
+        confusion_matrices = []
+
+        for job_id, job_result in self._frame_results.items():
+            intersection_frames.append(job_id)
+            conflicts += job_result.conflicts
+
+            if annotations is None:
+                annotations = deepcopy(job_result.annotations)
+            else:
+                annotations.accumulate(job_result.annotations)
+            confusion_matrices.append(job_result.annotations.confusion_matrix.rows)
+
+            if annotation_components is None:
+                annotation_components = deepcopy(job_result.annotation_components)
+            else:
+                annotation_components.accumulate(job_result.annotation_components)
+            mean_ious.append(job_result.annotation_components.shape.mean_iou)
+
+        # Get labels from the project (returns a queryset)
+        labels_queryset = self._ds_data_provider.job_data._db_task.project.get_labels()
+
+        # Convert the queryset to a dictionary of labels
+        confusion_matrix_labels = {
+            label.id: label.name
+            for label in labels_queryset
+            if not label.parent
+        }
+        confusion_matrix_labels[None] = "unmatched"
+        confusion_matrix_labels_rmap = {k: i for i, k in enumerate(confusion_matrix_labels.keys())}
+        if confusion_matrices:
+            confusion_matrix = np.sum(confusion_matrices, axis=0)
+        else:
+            confusion_matrix = np.zeros(
+                (len(confusion_matrix_labels), len(confusion_matrix_labels)), dtype=int
+            )
+        matched_ann_counts = np.diag(confusion_matrix)
+        ds_ann_counts = np.sum(confusion_matrix, axis=1)
+        gt_ann_counts = np.sum(confusion_matrix, axis=0)
+        label_accuracies = _arr_div(
+            matched_ann_counts, ds_ann_counts + gt_ann_counts - matched_ann_counts
+        )
+        label_precisions = _arr_div(matched_ann_counts, ds_ann_counts)
+        label_recalls = _arr_div(matched_ann_counts, gt_ann_counts)
+
+        valid_annotations_count = np.sum(matched_ann_counts)
+        missing_annotations_count = np.sum(confusion_matrix[confusion_matrix_labels_rmap[None], :])
+        extra_annotations_count = np.sum(confusion_matrix[:, confusion_matrix_labels_rmap[None]])
+        total_annotations_count = np.sum(confusion_matrix)
+        ds_annotations_count = (
+            np.sum(ds_ann_counts) - ds_ann_counts[confusion_matrix_labels_rmap[None]]
+        )
+        gt_annotations_count = (
+            np.sum(gt_ann_counts) - gt_ann_counts[confusion_matrix_labels_rmap[None]]
+        )
+
+        return ComparisonReport(
+            parameters=self.settings,
+            comparison_summary=ComparisonReportComparisonSummary(
+                frame_share=(
+                    len(intersection_frames) / (len(self._ds_data_provider.job_data.rel_range) or 1)
+                ),
+                frames=intersection_frames,
+                conflict_count=len(conflicts),
+                warning_count=len(
+                    [c for c in conflicts if c.severity == AnnotationConflictSeverity.WARNING]
+                ),
+                error_count=len(
+                    [c for c in conflicts if c.severity == AnnotationConflictSeverity.ERROR]
+                ),
+                conflicts_by_type=Counter(c.type for c in conflicts),
+                annotations=ComparisonReportAnnotationsSummary(
+                    valid_count=valid_annotations_count,
+                    missing_count=missing_annotations_count,
+                    extra_count=extra_annotations_count,
+                    total_count=total_annotations_count,
+                    ds_count=ds_annotations_count,
+                    gt_count=gt_annotations_count,
+                    confusion_matrix=ConfusionMatrix(
+                        labels=list(confusion_matrix_labels.values()),
+                        rows=confusion_matrix,
+                        precision=label_precisions,
+                        recall=label_recalls,
+                        accuracy=label_accuracies,
+                    ),
+                ),
+                annotation_components=ComparisonReportAnnotationComponentsSummary(
+                    shape=ComparisonReportAnnotationShapeSummary(
+                        valid_count=annotation_components.shape.valid_count,
+                        missing_count=annotation_components.shape.missing_count,
+                        extra_count=annotation_components.shape.extra_count,
+                        total_count=annotation_components.shape.total_count,
+                        ds_count=annotation_components.shape.ds_count,
+                        gt_count=annotation_components.shape.gt_count,
+                        mean_iou=np.mean(mean_ious),
+                    ),
+                    label=ComparisonReportAnnotationLabelSummary(
+                        valid_count=annotation_components.label.valid_count,
+                        invalid_count=annotation_components.label.invalid_count,
+                        total_count=annotation_components.label.total_count,
+                    ),
+                ),
+            ),
+            frame_results=self._frame_results,
+        )
+
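Likewise, _arr_div is used above for the per-label metrics but defined elsewhere in the module. The precision/recall/accuracy computations rely on it behaving as a zero-safe element-wise division, roughly as in this sketch (assumed behavior, not the verified implementation):

import numpy as np

def _arr_div(a_arr, b_arr):
    # Element-wise a / b, yielding 0 where the denominator is 0 instead of
    # NaN/inf, so labels with no annotations get 0 precision/recall/accuracy.
    return np.divide(
        a_arr, b_arr,
        out=np.zeros_like(a_arr, dtype=float),
        where=b_arr != 0,
    )

# e.g. matched counts [3, 0] over ds counts [4, 0] -> precisions [0.75, 0.0]
print(_arr_div(np.array([3, 0]), np.array([4, 0])))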
 class QualityReportUpdateManager:
     _QUEUE_JOB_PREFIX = "update-quality-metrics-task-"
     _RQ_CUSTOM_QUALITY_CHECK_JOB_TYPE = "custom_quality_check"
@@ -2277,6 +2858,7 @@ def _compute_reports(self, task_id: int) -> int:
         gt_job_frames = gt_job_data_provider.job_data.get_included_frames()

         jobs: List[Job] = [j for j in job_queryset if j.type == JobType.ANNOTATION]
+        jobs = sorted(jobs, key=lambda job: job.id)
         job_data_providers = {
             job.id: JobDataProvider(
                 job.id, queryset=job_queryset, included_frames=gt_job_frames
@@ -2286,14 +2868,31 @@ def _compute_reports(self, task_id: int) -> int:

         quality_params = self._get_task_quality_params(task)

+        job_duration = (task.data.chunk_size * task.audio_total_duration / (task.data.stop_frame + 1)) / 1000  # in seconds
+
         job_comparison_reports: Dict[int, ComparisonReport] = {}
+        ind = 0  # job index, used to compute the offset for intersecting jobs
         for job in jobs:
-            job_data_provider = job_data_providers[job.id]
-            comparator = DatasetComparator(
-                job_data_provider, gt_job_data_provider, settings=quality_params
+            job_id = job.id
+            job_data_provider = job_data_providers[job_id]
+            # comparator = DatasetComparator(
+            #     job_data_provider, gt_job_data_provider, settings=quality_params
+            # )
+            # job_comparison_reports[job.id] = comparator.generate_report()
+            offset = ind * job_duration  # required only when jobs are intersecting
+
+            start = job_data_provider.job_data.start
+            end = job_data_provider.job_data.stop - 1
+            gt_frame_list = list(gt_job_frames)
+            if not (start in gt_frame_list or end in gt_frame_list):
+                offset = 0
+                ind -= 1
+
+            comparator = AudioDatasetComparator(
+                job_data_provider, gt_job_data_provider, offset, job_duration, settings=quality_params
             )
-            job_comparison_reports[job.id] = comparator.generate_report()
-
+            job_comparison_reports[job_id] = comparator.generate_audio_report()
+            ind += 1
             # Release resources
             del job_data_provider.dm_dataset
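To make the offset arithmetic concrete: job_duration converts the task's chunk size into seconds of audio per job, assuming audio_total_duration is stored in milliseconds (hence the final division by 1000). A worked example with hypothetical numbers:

# Hypothetical task: 10 minutes of audio, 600 frames, 60 frames per chunk.
chunk_size = 60                  # frames per job chunk
audio_total_duration = 600_000   # total audio length, in milliseconds
stop_frame = 599                 # last frame index, so 600 frames in total

# Same formula as in _compute_reports above.
job_duration = (chunk_size * audio_total_duration / (stop_frame + 1)) / 1000
print(job_duration)  # 60.0 -> the k-th intersecting job is offset by k * 60 s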
@@ -2458,6 +3057,8 @@ def _save_reports(self, *, task_report: Dict, job_reports: List[Dict]) -> models
                 type=conflict["type"],
                 frame=conflict["frame_id"],
                 severity=conflict["severity"],
+                word_error_rate=conflict["word_error_rate"],
+                character_error_rate=conflict["character_error_rate"],
             )
             db_conflicts.append(db_conflict)
diff --git a/cvat/apps/quality_control/serializers.py b/cvat/apps/quality_control/serializers.py
index 711799dcef61..dbac6ee6ced3 100644
--- a/cvat/apps/quality_control/serializers.py
+++ b/cvat/apps/quality_control/serializers.py
@@ -21,7 +21,7 @@ class AnnotationConflictSerializer(serializers.ModelSerializer):

     class Meta:
         model = models.AnnotationConflict
-        fields = ("id", "frame", "type", "annotation_ids", "report_id", "severity")
+        fields = ("id", "frame", "type", "annotation_ids", "report_id", "severity", "word_error_rate", "character_error_rate")
         read_only_fields = fields
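With the two new model fields exposed by the serializer, a serialized conflict now carries the transcript metrics. A hypothetical payload (all values illustrative, including the annotation_ids shape):

# Hypothetical serialized AnnotationConflict after the field additions above.
example_conflict = {
    "id": 17,
    "frame": 3,
    "type": "mismatching_transcript",
    "annotation_ids": [{"obj_id": 101, "job_id": 3, "type": "shape"}],
    "report_id": 5,
    "severity": "warning",
    "word_error_rate": 0.25,       # new field
    "character_error_rate": 0.08,  # new field
}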
diff --git a/cvat/requirements/base.in b/cvat/requirements/base.in
index c4a1380d961c..6a0e272abd11 100644
--- a/cvat/requirements/base.in
+++ b/cvat/requirements/base.in
@@ -53,4 +53,5 @@ rq==1.15.1
 rules>=3.3
 Shapely==1.7.1
 tensorflow==2.11.1 # Optional requirement of Datumaro. Use tensorflow-macos==2.8.0 for Mac M1
-soundfile==0.12.1
\ No newline at end of file
+soundfile==0.12.1
+chardet==5.2.0
\ No newline at end of file
diff --git a/cvat/requirements/base.txt b/cvat/requirements/base.txt
index bcccfb1ee658..ca16427706b4 100644
--- a/cvat/requirements/base.txt
+++ b/cvat/requirements/base.txt
@@ -409,3 +409,4 @@ setuptools==68.2.2
     # tensorflow
 soundfile==0.12.1
+chardet==5.2.0
\ No newline at end of file
diff --git a/cvat/requirements/development.in b/cvat/requirements/development.in
index 4d824be221cc..de43d0a947cd 100644
--- a/cvat/requirements/development.in
+++ b/cvat/requirements/development.in
@@ -8,4 +8,5 @@ pylint-plugin-utils==0.7
 pylint==2.14.5
 rope==0.17.0
 snakeviz==2.1.0
-soundfile==0.12.1
\ No newline at end of file
+soundfile==0.12.1
+chardet==5.2.0
\ No newline at end of file
diff --git a/cvat/requirements/development.txt b/cvat/requirements/development.txt
index 2d36b030a02b..a1cd030bdad6 100644
--- a/cvat/requirements/development.txt
+++ b/cvat/requirements/development.txt
@@ -62,5 +62,5 @@ tornado==6.3.3
     # via snakeviz
 soundfile==0.12.1
-
+chardet==5.2.0
 # The following packages are considered to be unsafe in a requirements file:
diff --git a/cvat/requirements/production.txt b/cvat/requirements/production.txt
index 16360b4e3553..16db54eaefda 100644
--- a/cvat/requirements/production.txt
+++ b/cvat/requirements/production.txt
@@ -29,4 +29,5 @@ watchfiles==0.20.0
 websockets==11.0.3
     # via uvicorn
 soundfile==0.12.1
+chardet==5.2.0
 # The following packages are considered to be unsafe in a requirements file:
diff --git a/cvat/settings/base.py b/cvat/settings/base.py
index e3d2e6cebfd1..910a6b9838fc 100644
--- a/cvat/settings/base.py
+++ b/cvat/settings/base.py
@@ -204,7 +204,7 @@ def generate_secret_key():
     'cvat.apps.iam.views.ContextMiddleware',
 ]

-UI_URL = ''
+UI_URL = 'https://app.audino.in'

 STATICFILES_FINDERS = [
     'django.contrib.staticfiles.finders.FileSystemFinder',
@@ -270,9 +270,9 @@ def GET_IAM_DEFAULT_ROLES(user) -> list:

 # set UI url to redirect after a successful e-mail confirmation
 #changed from '/auth/login' to '/auth/email-confirmation' for email confirmation message
-ACCOUNT_EMAIL_CONFIRMATION_ANONYMOUS_REDIRECT_URL = '/auth/email-confirmation'
-ACCOUNT_EMAIL_VERIFICATION_SENT_REDIRECT_URL = '/auth/email-verification-sent'
-INCORRECT_EMAIL_CONFIRMATION_URL = '/auth/incorrect-email-confirmation'
+ACCOUNT_EMAIL_CONFIRMATION_ANONYMOUS_REDIRECT_URL = f'{UI_URL}/auth/email-confirmation'
+ACCOUNT_EMAIL_VERIFICATION_SENT_REDIRECT_URL = f'{UI_URL}/auth/email-verification-sent'
+INCORRECT_EMAIL_CONFIRMATION_URL = f'{UI_URL}/auth/incorrect-email-confirmation'

 OLD_PASSWORD_FIELD_ENABLED = True
@@ -568,6 +568,7 @@ class CVAT_QUEUES(Enum):
     'upload-finish',
     'upload-multiple',
     'x-organization',
+    'upload-metadata',
 ]

 TUS_MAX_FILE_SIZE = 26843545600 # 25gb
diff --git a/cvat/settings/email_settings.py b/cvat/settings/email_settings.py
index d3f9621e09d4..c40866a12d48 100644
--- a/cvat/settings/email_settings.py
+++ b/cvat/settings/email_settings.py
@@ -10,7 +10,7 @@
 ACCOUNT_AUTHENTICATION_METHOD = 'username_email'
 ACCOUNT_CONFIRM_EMAIL_ON_GET = True
 ACCOUNT_EMAIL_REQUIRED = True
-ACCOUNT_EMAIL_VERIFICATION = 'mandatory'
+ACCOUNT_EMAIL_VERIFICATION = 'none'

 # Email backend settings for Django
 EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
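Since the confirmation redirects are now derived from UI_URL, they resolve to absolute URLs on the frontend origin instead of backend-relative paths. A quick check of what the f-strings above evaluate to:

UI_URL = 'https://app.audino.in'

ACCOUNT_EMAIL_CONFIRMATION_ANONYMOUS_REDIRECT_URL = f'{UI_URL}/auth/email-confirmation'
assert (ACCOUNT_EMAIL_CONFIRMATION_ANONYMOUS_REDIRECT_URL
        == 'https://app.audino.in/auth/email-confirmation')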