Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/audino v2 #5

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
0e42dae
bug fix
kushalpoddar Apr 20, 2024
6589411
Email templating changes
kushalpoddar Apr 26, 2024
2d29065
Added pydub
kushalpoddar May 6, 2024
2da6517
alter col
kushalpoddar May 6, 2024
74e6743
added new audio total duration field
kushalpoddar May 6, 2024
6a602b6
Updated migrations
kushalpoddar May 8, 2024
97a158b
Audio duration finding using av
kushalpoddar May 9, 2024
bfaebcc
local
kushalpoddar May 9, 2024
531de53
Added UI Url
kushalpoddar May 9, 2024
44df236
Bug fix
kushalpoddar May 9, 2024
c696325
Bug fix
kushalpoddar May 10, 2024
5c8895a
readded pydub
kushalpoddar May 10, 2024
0799c54
bug fix for av
kushalpoddar May 10, 2024
9991d03
Voxpopuli bug fix and segment size changed
kushalpoddar May 21, 2024
16e0509
added email notification when annotation is done
ashish7515 May 26, 2024
f3beafa
Bug fixes
kushalpoddar May 29, 2024
111d2b6
resolve bug for large files
ashish7515 May 30, 2024
32a065b
merge conflicts
ashish7515 May 30, 2024
0bde3ff
update
ashish7515 May 30, 2024
62e94bb
bug fix
kushalpoddar May 30, 2024
0ff4e30
Encoding bug resolved for audio
kushalpoddar May 30, 2024
4cc294c
Added chardet to requirements
kushalpoddar May 30, 2024
e2b4c07
bug fix for audios
kushalpoddar May 30, 2024
9b7d1ac
Merge pull request #2 from midas-research/my-branch
rohan220217 Jun 3, 2024
030e245
start end in download csv, mp3 format, typo error
ashish7515 Jun 5, 2024
ebe46a5
comment removed
ashish7515 Jun 6, 2024
2cf5d5a
Merge pull request #3 from midas-research/my-branch
rohan220217 Jun 6, 2024
a52c010
Merge branch 'feat/audino-v2' of https://github.com/midas-research/cv…
ashish7515 Jun 15, 2024
f23cd60
conflict resolved
ashish7515 Jun 15, 2024
a628c20
Merge pull request #4 from midas-research/feat/ground_truth
rohan220217 Jun 15, 2024
95253ff
rebased
kushalpoddar Jun 16, 2024
6779ccf
Bug fixes
kushalpoddar Jun 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 69 additions & 29 deletions cvat/apps/dataset_manager/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import os
import json
import zipfile
from pydub import AudioSegment
from scipy.io import wavfile
import numpy as np
from collections import OrderedDict
Expand All @@ -19,17 +20,17 @@
from tempfile import TemporaryDirectory
from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError

from django.conf import settings
# from django.conf import settings
from django.db import transaction
from django.db.models.query import Prefetch
from cvat.apps.engine.models import Job, AttributeSpec
from django.utils import timezone
from rest_framework.exceptions import ValidationError

from cvat.apps.engine import models, serializers
from cvat.apps.engine.plugins import plugin_decorator
from cvat.apps.events.handlers import handle_annotations_change
from cvat.apps.profiler import silk_profile
from cvat.apps.engine.cache import MediaCache
from cvat.apps.engine.frame_provider import FrameProvider
from cvat.apps.dataset_manager.annotation import AnnotationIR, AnnotationManager
from cvat.apps.dataset_manager.bindings import TaskData, JobData, CvatImportError
Expand Down Expand Up @@ -878,9 +879,6 @@ def jobChunkPathGetter(db_data, start, stop, task_dimension, data_quality, data_
# db_data = Task Data
frame_provider = FrameProvider(db_data, task_dimension)

start_chunk = frame_provider.get_chunk_number(start)
stop_chunk = frame_provider.get_chunk_number(stop)

# self.type = data_type
number = int(data_num) if data_num is not None else None

Expand All @@ -894,15 +892,20 @@ def jobChunkPathGetter(db_data, start, stop, task_dimension, data_quality, data_

return path

def chunk_annotation_audio(audio_file, output_folder, annotations):
# Load audio
# y, sr = librosa.load(audio_file, sr=None)
sr, y = wavfile.read(audio_file)
def chunk_annotation_audio(concat_array, output_folder, annotations):
# Convert NumPy array to AudioSegment
sr = 44100 # sampling rate
audio_segment = AudioSegment(concat_array.tobytes(), frame_rate=sr, channels=1, sample_width=4)

try:
y = audio_segment.get_array_of_samples()
except Exception as e:
return None

data = []
# Loop over shapes

for i, shape in enumerate(annotations, 1):
# Extract transcript and time points

start_time = min(shape['points'][:2])
end_time = max(shape['points'][2:])

Expand All @@ -914,14 +917,11 @@ def chunk_annotation_audio(audio_file, output_folder, annotations):
chunk = y[start_sample:end_sample]

clip_uuid = str(uuid.uuid4())
# Save the chunk with transcript as filename
output_file = os.path.join(output_folder, f"{clip_uuid}.wav")
output_file = os.path.join(output_folder, f"{clip_uuid}.mp3")
soundfile.write(output_file, chunk, sr)

data.append(output_file)

# logger.info(f"Annotation {str(i)} Chunk saved: {output_file}")

return data

def create_annotation_clips_zip(annotation_audio_chunk_file_paths, meta_data_file_path, output_folder, dst_file):
Expand Down Expand Up @@ -959,15 +959,14 @@ def get_np_audio_array_from_job(job_id):

job_data_chunk_size = job.db_job.segment.task.data.chunk_size
task_dimension = job.db_job.segment.task.dimension
storage_method = job.db_job.segment.task.data.storage_method

start = job.start_frame/job_data_chunk_size
stop = job.stop_frame/job_data_chunk_size

audio_array_buffer = []
for i in range(math.trunc(start), math.trunc(stop)+1):
db_job = job.db_job
data_type = "chunk"
# data_type = "chunk"
data_num = i
data_quality = 'compressed'

Expand All @@ -993,30 +992,68 @@ def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
# All Annotations
annotations = job.data["shapes"]

audio_file_path = os.path.join(temp_dir, str(job_id) + ".wav")
with wave.open(audio_file_path, 'wb') as wave_file:
wave_file.setnchannels(1)
wave_file.setsampwidth(4)
wave_file.setframerate(44100)
wave_file.writeframes(concat_array)
# Job detail

annotation_audio_chunk_file_paths = chunk_annotation_audio(audio_file_path, temp_dir, annotations)
# Find labels of a particular job
job_details = Job.objects.get(id=job_id)
labels_queryset = job_details.get_labels()
labels_list = list(labels_queryset.values())

for i in range(0, len(annotation_audio_chunk_file_paths)):
final_data.append({"path" : os.path.basename(annotation_audio_chunk_file_paths[i]), "sentence" : annotations[i]["transcript"], "age" : annotations[i]["age"], "gender" : annotations[i]["gender"], "accents" : annotations[i]["accent"], "locale" : annotations[i]["locale"], "emotion" : annotations[i]["emotion"] })
labels_mapping = {}

for label in labels_list:
labels_mapping[label["id"]] = label

label_attributes_queryset = AttributeSpec.objects.filter(label=label["id"])

attributes_list = list(label_attributes_queryset.values())

labels_mapping[label["id"]]["attributes"] = {}

for attribute in attributes_list:
labels_mapping[label["id"]]["attributes"][attribute["id"]] = attribute

slogger.glob.debug("JOB LABELS ATTRIBUTES")
slogger.glob.debug(json.dumps(attributes_list))


slogger.glob.debug("JOB LABELS")
slogger.glob.debug(json.dumps(labels_list))

# audio_file_path = os.path.join(temp_dir, str(job_id) + ".wav")
# with wave.open(audio_file_path, 'wb') as wave_file:
# wave_file.setnchannels(1)
# wave_file.setsampwidth(4)
# wave_file.setframerate(44100)
# wave_file.writeframes(concat_array)

annotation_audio_chunk_file_paths = chunk_annotation_audio(concat_array, temp_dir, annotations)

for i in range(0, len(annotation_audio_chunk_file_paths)):
annotation_attribute_id = annotations[i]["attributes"][0]["spec_id"]
label_attributes = labels_mapping[annotations[i]["label_id"]]["attributes"]
annotation_attribute = label_attributes[annotation_attribute_id]
attribute_name = annotation_attribute["name"]
attribute_val = annotations[i]["attributes"][0]["value"]

final_data.append({"path" : os.path.basename(annotation_audio_chunk_file_paths[i]), "sentence" : annotations[i]["transcript"], "age" : annotations[i]["age"], "gender" : annotations[i]["gender"], "accents" : annotations[i]["accent"], "locale" : annotations[i]["locale"], "emotion" : annotations[i]["emotion"], "label" : labels_mapping[annotations[i]["label_id"]]["name"], "attribute_name" : attribute_name, "attribute_value" : attribute_val, "start" : annotations[i]["points"][0], "end" : annotations[i]["points"][3]})

slogger.glob.debug("JOB ANNOTATION DATA")
slogger.glob.debug(json.dumps(final_data))
slogger.glob.debug("All ANNOTATIONs DATA")
slogger.glob.debug(json.dumps(annotations))
return final_data, annotation_audio_chunk_file_paths

def convert_annotation_data_format(data, format_name):
    """Convert Common Voice-shaped annotation rows into the requested export format.

    Each input row is a dict produced by the audio export pipeline with at least
    the keys: ``path``, ``sentence``, ``gender``, ``locale``, ``accents``,
    ``label``, ``attribute_name``, ``attribute_value``, ``start`` and ``end``.

    Supported ``format_name`` values:
      * ``"Common Voice"`` — rows are already in this shape; returned unchanged.
      * ``"Librispeech"`` — file/text oriented rows with a fresh random ``id``.
      * ``"VoxPopuli"``  — audio_id/raw_text rows; ``language`` is a numeric id
        looked up from the row's locale (``None`` when the locale is unknown).
      * ``"Ted-Lium"``   — file/text/gender rows with a fresh random ``id``.

    Any other format name returns the data unchanged.
    """
    if format_name == "Common Voice":
        return data

    if format_name == "Librispeech":
        data = [
            {
                "chapter_id": "",
                "file": row["path"],
                "id": str(uuid.uuid4()),
                "speaker_id": "",
                "text": row["sentence"],
                "label": row["label"],
                "attribute_name": row["attribute_name"],
                "attribute_value": row["attribute_value"],
                "start": row["start"],
                "end": row["end"],
            }
            for row in data
        ]
    elif format_name == "VoxPopuli":
        language_id_mapping = {"en": 0}
        data = [
            {
                "audio_id": str(uuid.uuid4()),
                # BUG FIX: use .get() directly so a mapped id of 0 survives.
                # The previous `mapping[k] if mapping.get(k) else None` treated
                # the id 0 ("en") as falsy and always produced None.
                "language": language_id_mapping.get(row["locale"]),
                "audio_path": row["path"],
                "raw_text": row["sentence"],
                "normalized_text": row["sentence"],
                "gender": row["gender"],
                "speaker_id": "",
                "is_gold_transcript": False,
                "accent": row["accents"],
                "label": row["label"],
                "attribute_name": row["attribute_name"],
                "attribute_value": row["attribute_value"],
                "start": row["start"],
                "end": row["end"],
            }
            for row in data
        ]
    elif format_name == "Ted-Lium":
        data = [
            {
                "file": row["path"],
                "text": row["sentence"],
                "gender": row["gender"],
                "id": str(uuid.uuid4()),
                "speaker_id": "",
                "label": row["label"],
                "attribute_name": row["attribute_name"],
                "attribute_value": row["attribute_value"],
                "start": row["start"],
                "end": row["end"],
            }
            for row in data
        ]

    return data
def export_audino_job(job_id, dst_file, format_name, server_url=None, save_images=False):
Expand Down Expand Up @@ -1074,6 +1111,9 @@ def export_audino_task(task_id, dst_file, format_name, server_url=None, save_ima

final_data, annotation_audio_chunk_file_paths = get_audio_job_export_data(job.db_job.id, dst_file, job, temp_dir_base, temp_dir)

# Convert the data into a format
final_data = convert_annotation_data_format(final_data, format_name)

final_task_data.append(final_data)
final_annotation_chunk_paths.append(annotation_audio_chunk_file_paths)

Expand Down
12 changes: 12 additions & 0 deletions cvat/apps/engine/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ class _TaskBackupBase(_BackupBase):
def _prepare_task_meta(self, task):
allowed_fields = {
'name',
'segment_duration',
'bug_tracker',
'status',
'subset',
Expand Down Expand Up @@ -232,6 +233,12 @@ def _prepare_annotations(self, annotations, label_mapping):
'attributes',
'shapes',
'elements',
'gender',
'age',
'accent',
'transcript',
'locale',
'emotion'
}

def _update_attribute(attribute, label):
Expand Down Expand Up @@ -328,6 +335,7 @@ def __init__(self, pk, version=Version.V1):
self._db_task = models.Task.objects.prefetch_related('data__images', 'annotation_guide__assets').select_related('data__video', 'annotation_guide').get(pk=pk)
self._db_data = self._db_task.data
self._version = version
self.logger = slogger.task[pk]

db_labels = (self._db_task.project if self._db_task.project_id else self._db_task).label_set.all().prefetch_related(
'attributespec_set')
Expand Down Expand Up @@ -382,6 +390,8 @@ def _write_task(self, zip_object, target_dir=None):
def _write_manifest(self, zip_object, target_dir=None):
def serialize_task():
task_serializer = TaskReadSerializer(self._db_task)
# self.logger.info("WRITE MANIFEST")
# self.logger.info(task_serializer.data)
for field in ('url', 'owner', 'assignee'):
task_serializer.fields.pop(field)

Expand Down Expand Up @@ -641,6 +651,8 @@ def _write_data(zip_object):
jobs = self._manifest.pop('jobs')

self._prepare_task_meta(self._manifest)
self._logger.info("DEBUG IMPORT")
self._logger.info(self._manifest)
self._manifest['owner_id'] = self._user_id
self._manifest['project_id'] = self._project_id

Expand Down
91 changes: 39 additions & 52 deletions cvat/apps/engine/media_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
import itertools
import struct
from enum import IntEnum
import chardet
from abc import ABC, abstractmethod
from contextlib import closing
from typing import Iterable
from cvat.apps.engine.log import ServerLogManager
slogger = ServerLogManager(__name__)
# from cvat.apps.engine.log import ServerLogManager
# slogger = ServerLogManager(__name__)

import av
import numpy as np
Expand Down Expand Up @@ -505,6 +506,26 @@ def _has_frame(self, i):

return False

def get_total_frames(self):
    """Count the decoded frames in the container's first audio stream.

    Demuxes and decodes the whole stream, so this is a full pass over the
    file; the cost is proportional to the audio length.
    """
    frame_count = 0
    with self._get_av_container() as container:
        audio_stream = container.streams.audio[0]
        audio_stream.thread_type = 'AUTO'  # let av choose a threading strategy
        for packet in container.demux(audio_stream):
            frame_count += sum(1 for _ in packet.decode())
    return frame_count

def get_file_encoding(self, file_path):
    """Detect the text encoding of *file_path* by sniffing its first 1 KiB.

    Returns the encoding name reported by chardet, or ``None`` when chardet
    cannot make a confident guess.
    """
    with open(file_path, 'rb') as sample_file:
        sample = sample_file.read(1024)
    detection = chardet.detect(sample)
    return detection['encoding']

def __iter__(self):
with self._get_av_container() as container:
stream = container.streams.audio[0]
Expand All @@ -523,7 +544,12 @@ def get_progress(self, pos):
def _get_av_container(self):
if isinstance(self._source_path[0], io.BytesIO):
self._source_path[0].seek(0) # required for re-reading
return av.open(self._source_path[0])

encoding = self.get_file_encoding(self._source_path[0])
if encoding:
return av.open(self._source_path[0], metadata_encoding = encoding)
else:
return av.open(self._source_path[0])

def _get_duration(self):
with self._get_av_container() as container:
Expand All @@ -543,25 +569,24 @@ def _get_duration(self):

def get_preview(self, frame):
with self._get_av_container() as container:
stream = container.streams.video[0]
stream = container.streams.audio[0]
tb_denominator = stream.time_base.denominator
needed_time = int((frame / stream.guessed_rate) * tb_denominator)
container.seek(offset=needed_time, stream=stream)
for packet in container.demux(stream):
for frame in packet.decode():
return self._get_preview(frame.to_image() if not stream.metadata.get('rotate') \
else av.VideoFrame().from_ndarray(
else av.AudioFrame().from_ndarray(
rotate_image(
frame.to_ndarray(format='bgr24'),
360 - int(container.streams.video[0].metadata.get('rotate'))
360 - int(container.streams.audio[0].metadata.get('rotate'))
),
format ='bgr24'
).to_image()
)

def get_image_size(self, i):
image = (next(iter(self)))[0]
return image.width, image.height
return 1, 1

class FragmentMediaReader:
def __init__(self, chunk_number, chunk_size, start, stop, step=1):
Expand Down Expand Up @@ -953,44 +978,6 @@ def save_as_chunk(self, images, chunk_path):
self._encode_images(images, output_container, output_v_stream)
return [(input_w, input_h)]

class AudioCompressedChunkWriter(AudioChunkWriter):
def __init__(self, quality):
super().__init__(quality)
if self._codec_name == 'libx264':
self._codec_opts = {
'profile': 'baseline',
'coder': '0',
'crf': str(self._image_quality),
'wpredp': '0',
'flags': '-loop',
}

def save_as_chunk(self, images, chunk_path):
if not images:
raise Exception('no images to save')

input_w = images[0][0].width
input_h = images[0][0].height

downscale_factor = 1
while input_h / downscale_factor >= 1080:
downscale_factor *= 2

output_h = input_h // downscale_factor
output_w = input_w // downscale_factor

with av.open(chunk_path, 'w', format=self.FORMAT) as output_container:
output_v_stream = self._add_video_stream(
container=output_container,
w=output_w,
h=output_h,
rate=self._output_fps,
options=self._codec_opts,
)

self._encode_images(images, output_container, output_v_stream)
return [(input_w, input_h)]

def _is_archive(path):
mime = mimetypes.guess_type(path)
mime_type = mime[0]
Expand Down Expand Up @@ -1043,18 +1030,18 @@ def _is_zip(path):
'mode': 'annotation',
'unique': False,
},
'video': {
'has_mime_type': _is_video,
'extractor': VideoReader,
'mode': 'interpolation',
'unique': True,
},
'audio': {
'has_mime_type': _is_audio,
'extractor': AudioReader,
'mode': 'interpolation',
'unique': False,
},
'video': {
'has_mime_type': _is_video,
'extractor': VideoReader,
'mode': 'interpolation',
'unique': True,
},
'archive': {
'has_mime_type': _is_archive,
'extractor': ArchiveReader,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 4.2.6 on 2024-04-15 05:10
# Generated by Django 4.2.6 on 2024-04-19 09:27

import cvat.apps.engine.models
from django.db import migrations, models
Expand Down
Loading