Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/audino v2 #5

Closed
wants to merge 32 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
0e42dae
bug fix
kushalpoddar Apr 20, 2024
6589411
Email templating changes
kushalpoddar Apr 26, 2024
2d29065
Added pydub
kushalpoddar May 6, 2024
2da6517
alter col
kushalpoddar May 6, 2024
74e6743
added new audio total duration field
kushalpoddar May 6, 2024
6a602b6
Updated migrations
kushalpoddar May 8, 2024
97a158b
Audio duration finding using av
kushalpoddar May 9, 2024
bfaebcc
local
kushalpoddar May 9, 2024
531de53
Added UI Url
kushalpoddar May 9, 2024
44df236
Bug fix
kushalpoddar May 9, 2024
c696325
Bug fix
kushalpoddar May 10, 2024
5c8895a
readded pydub
kushalpoddar May 10, 2024
0799c54
bug fix for av
kushalpoddar May 10, 2024
9991d03
Voxpopuli bug fix and segment size changed
kushalpoddar May 21, 2024
16e0509
added email notification when annotation is done
ashish7515 May 26, 2024
f3beafa
Bug fixes
kushalpoddar May 29, 2024
111d2b6
resolve bug for large files
ashish7515 May 30, 2024
32a065b
merge conflicts
ashish7515 May 30, 2024
0bde3ff
update
ashish7515 May 30, 2024
62e94bb
bug fix
kushalpoddar May 30, 2024
0ff4e30
Encoding bug resolved for audio
kushalpoddar May 30, 2024
4cc294c
Added chardet to requirements
kushalpoddar May 30, 2024
e2b4c07
bug fix for audios
kushalpoddar May 30, 2024
9b7d1ac
Merge pull request #2 from midas-research/my-branch
rohan220217 Jun 3, 2024
030e245
start end in download csv, mp3 format, typo error
ashish7515 Jun 5, 2024
ebe46a5
comment removed
ashish7515 Jun 6, 2024
2cf5d5a
Merge pull request #3 from midas-research/my-branch
rohan220217 Jun 6, 2024
a52c010
Merge branch 'feat/audino-v2' of https://github.com/midas-research/cv…
ashish7515 Jun 15, 2024
f23cd60
conflict resolved
ashish7515 Jun 15, 2024
a628c20
Merge pull request #4 from midas-research/feat/ground_truth
rohan220217 Jun 15, 2024
95253ff
rebased
kushalpoddar Jun 16, 2024
6779ccf
Bug fixes
kushalpoddar Jun 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 69 additions & 29 deletions cvat/apps/dataset_manager/task.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import os
import json
import zipfile
from pydub import AudioSegment
from scipy.io import wavfile
import numpy as np
from collections import OrderedDict
Expand All @@ -19,17 +20,17 @@
from tempfile import TemporaryDirectory
from datumaro.components.errors import DatasetError, DatasetImportError, DatasetNotFoundError

from django.conf import settings
# from django.conf import settings
from django.db import transaction
from django.db.models.query import Prefetch
from cvat.apps.engine.models import Job, AttributeSpec
from django.utils import timezone
from rest_framework.exceptions import ValidationError

from cvat.apps.engine import models, serializers
from cvat.apps.engine.plugins import plugin_decorator
from cvat.apps.events.handlers import handle_annotations_change
from cvat.apps.profiler import silk_profile
from cvat.apps.engine.cache import MediaCache
from cvat.apps.engine.frame_provider import FrameProvider
from cvat.apps.dataset_manager.annotation import AnnotationIR, AnnotationManager
from cvat.apps.dataset_manager.bindings import TaskData, JobData, CvatImportError
Expand Down Expand Up @@ -878,9 +879,6 @@ def jobChunkPathGetter(db_data, start, stop, task_dimension, data_quality, data_
# db_data = Task Data
frame_provider = FrameProvider(db_data, task_dimension)

start_chunk = frame_provider.get_chunk_number(start)
stop_chunk = frame_provider.get_chunk_number(stop)

# self.type = data_type
number = int(data_num) if data_num is not None else None

Expand All @@ -894,15 +892,20 @@ def jobChunkPathGetter(db_data, start, stop, task_dimension, data_quality, data_

return path

def chunk_annotation_audio(audio_file, output_folder, annotations):
# Load audio
# y, sr = librosa.load(audio_file, sr=None)
sr, y = wavfile.read(audio_file)
def chunk_annotation_audio(concat_array, output_folder, annotations):
# Convert NumPy array to AudioSegment
sr = 44100 # sampling rate
audio_segment = AudioSegment(concat_array.tobytes(), frame_rate=sr, channels=1, sample_width=4)

try:
y = audio_segment.get_array_of_samples()
except Exception as e:
return None

data = []
# Loop over shapes

for i, shape in enumerate(annotations, 1):
# Extract transcript and time points

start_time = min(shape['points'][:2])
end_time = max(shape['points'][2:])

Expand All @@ -914,14 +917,11 @@ def chunk_annotation_audio(audio_file, output_folder, annotations):
chunk = y[start_sample:end_sample]

clip_uuid = str(uuid.uuid4())
# Save the chunk with transcript as filename
output_file = os.path.join(output_folder, f"{clip_uuid}.wav")
output_file = os.path.join(output_folder, f"{clip_uuid}.mp3")
soundfile.write(output_file, chunk, sr)

data.append(output_file)

# logger.info(f"Annotation {str(i)} Chunk saved: {output_file}")

return data

def create_annotation_clips_zip(annotation_audio_chunk_file_paths, meta_data_file_path, output_folder, dst_file):
Expand Down Expand Up @@ -959,15 +959,14 @@ def get_np_audio_array_from_job(job_id):

job_data_chunk_size = job.db_job.segment.task.data.chunk_size
task_dimension = job.db_job.segment.task.dimension
storage_method = job.db_job.segment.task.data.storage_method

start = job.start_frame/job_data_chunk_size
stop = job.stop_frame/job_data_chunk_size

audio_array_buffer = []
for i in range(math.trunc(start), math.trunc(stop)+1):
db_job = job.db_job
data_type = "chunk"
# data_type = "chunk"
data_num = i
data_quality = 'compressed'

Expand All @@ -993,30 +992,68 @@ def get_audio_job_export_data(job_id, dst_file, job, temp_dir_base, temp_dir):
# All Annotations
annotations = job.data["shapes"]

audio_file_path = os.path.join(temp_dir, str(job_id) + ".wav")
with wave.open(audio_file_path, 'wb') as wave_file:
wave_file.setnchannels(1)
wave_file.setsampwidth(4)
wave_file.setframerate(44100)
wave_file.writeframes(concat_array)
# Job detail

annotation_audio_chunk_file_paths = chunk_annotation_audio(audio_file_path, temp_dir, annotations)
# Find labels of a particular job
job_details = Job.objects.get(id=job_id)
labels_queryset = job_details.get_labels()
labels_list = list(labels_queryset.values())

for i in range(0, len(annotation_audio_chunk_file_paths)):
final_data.append({"path" : os.path.basename(annotation_audio_chunk_file_paths[i]), "sentence" : annotations[i]["transcript"], "age" : annotations[i]["age"], "gender" : annotations[i]["gender"], "accents" : annotations[i]["accent"], "locale" : annotations[i]["locale"], "emotion" : annotations[i]["emotion"] })
labels_mapping = {}

for label in labels_list:
labels_mapping[label["id"]] = label

label_attributes_queryset = AttributeSpec.objects.filter(label=label["id"])

attributes_list = list(label_attributes_queryset.values())

labels_mapping[label["id"]]["attributes"] = {}

for attribute in attributes_list:
labels_mapping[label["id"]]["attributes"][attribute["id"]] = attribute

slogger.glob.debug("JOB LABELS ATTRIBUTES")
slogger.glob.debug(json.dumps(attributes_list))


slogger.glob.debug("JOB LABELS")
slogger.glob.debug(json.dumps(labels_list))

# audio_file_path = os.path.join(temp_dir, str(job_id) + ".wav")
# with wave.open(audio_file_path, 'wb') as wave_file:
# wave_file.setnchannels(1)
# wave_file.setsampwidth(4)
# wave_file.setframerate(44100)
# wave_file.writeframes(concat_array)

annotation_audio_chunk_file_paths = chunk_annotation_audio(concat_array, temp_dir, annotations)

for i in range(0, len(annotation_audio_chunk_file_paths)):
annotation_attribute_id = annotations[i]["attributes"][0]["spec_id"]
label_attributes = labels_mapping[annotations[i]["label_id"]]["attributes"]
annotation_attribute = label_attributes[annotation_attribute_id]
attribute_name = annotation_attribute["name"]
attribute_val = annotations[i]["attributes"][0]["value"]

final_data.append({"path" : os.path.basename(annotation_audio_chunk_file_paths[i]), "sentence" : annotations[i]["transcript"], "age" : annotations[i]["age"], "gender" : annotations[i]["gender"], "accents" : annotations[i]["accent"], "locale" : annotations[i]["locale"], "emotion" : annotations[i]["emotion"], "label" : labels_mapping[annotations[i]["label_id"]]["name"], "attribute_name" : attribute_name, "attribute_value" : attribute_val, "start" : annotations[i]["points"][0], "end" : annotations[i]["points"][3]})

slogger.glob.debug("JOB ANNOTATION DATA")
slogger.glob.debug(json.dumps(final_data))
slogger.glob.debug("All ANNOTATIONs DATA")
slogger.glob.debug(json.dumps(annotations))
return final_data, annotation_audio_chunk_file_paths

def convert_annotation_data_format(data, format_name):
    """Convert Common Voice-shaped annotation rows into the requested export format.

    Each input row is a dict produced by the audio export pipeline with at least
    the keys: ``path``, ``sentence``, ``gender``, ``locale``, ``accents``,
    ``label``, ``attribute_name``, ``attribute_value``, ``start`` and ``end``.

    Supported ``format_name`` values:
      * ``"Common Voice"`` — rows are already in this shape; returned unchanged.
      * ``"Librispeech"`` — file/text oriented rows with a fresh random ``id``.
      * ``"VoxPopuli"``  — audio_id/raw_text rows; ``language`` is a numeric id
        looked up from the row's locale (``None`` when the locale is unknown).
      * ``"Ted-Lium"``   — file/text/gender rows with a fresh random ``id``.

    Any other format name returns the data unchanged.
    """
    if format_name == "Common Voice":
        return data

    if format_name == "Librispeech":
        data = [
            {
                "chapter_id": "",
                "file": row["path"],
                "id": str(uuid.uuid4()),
                "speaker_id": "",
                "text": row["sentence"],
                "label": row["label"],
                "attribute_name": row["attribute_name"],
                "attribute_value": row["attribute_value"],
                "start": row["start"],
                "end": row["end"],
            }
            for row in data
        ]
    elif format_name == "VoxPopuli":
        language_id_mapping = {"en": 0}
        data = [
            {
                "audio_id": str(uuid.uuid4()),
                # BUG FIX: use .get() directly so a mapped id of 0 survives.
                # The previous `mapping[k] if mapping.get(k) else None` treated
                # the id 0 ("en") as falsy and always produced None.
                "language": language_id_mapping.get(row["locale"]),
                "audio_path": row["path"],
                "raw_text": row["sentence"],
                "normalized_text": row["sentence"],
                "gender": row["gender"],
                "speaker_id": "",
                "is_gold_transcript": False,
                "accent": row["accents"],
                "label": row["label"],
                "attribute_name": row["attribute_name"],
                "attribute_value": row["attribute_value"],
                "start": row["start"],
                "end": row["end"],
            }
            for row in data
        ]
    elif format_name == "Ted-Lium":
        data = [
            {
                "file": row["path"],
                "text": row["sentence"],
                "gender": row["gender"],
                "id": str(uuid.uuid4()),
                "speaker_id": "",
                "label": row["label"],
                "attribute_name": row["attribute_name"],
                "attribute_value": row["attribute_value"],
                "start": row["start"],
                "end": row["end"],
            }
            for row in data
        ]

    return data
def export_audino_job(job_id, dst_file, format_name, server_url=None, save_images=False):
Expand Down Expand Up @@ -1074,6 +1111,9 @@ def export_audino_task(task_id, dst_file, format_name, server_url=None, save_ima

final_data, annotation_audio_chunk_file_paths = get_audio_job_export_data(job.db_job.id, dst_file, job, temp_dir_base, temp_dir)

# Convert the data into a format
final_data = convert_annotation_data_format(final_data, format_name)

final_task_data.append(final_data)
final_annotation_chunk_paths.append(annotation_audio_chunk_file_paths)

Expand Down
12 changes: 12 additions & 0 deletions cvat/apps/engine/backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ class _TaskBackupBase(_BackupBase):
def _prepare_task_meta(self, task):
allowed_fields = {
'name',
'segment_duration',
'bug_tracker',
'status',
'subset',
Expand Down Expand Up @@ -232,6 +233,12 @@ def _prepare_annotations(self, annotations, label_mapping):
'attributes',
'shapes',
'elements',
'gender',
'age',
'accent',
'transcript',
'locale',
'emotion'
}

def _update_attribute(attribute, label):
Expand Down Expand Up @@ -328,6 +335,7 @@ def __init__(self, pk, version=Version.V1):
self._db_task = models.Task.objects.prefetch_related('data__images', 'annotation_guide__assets').select_related('data__video', 'annotation_guide').get(pk=pk)
self._db_data = self._db_task.data
self._version = version
self.logger = slogger.task[pk]

db_labels = (self._db_task.project if self._db_task.project_id else self._db_task).label_set.all().prefetch_related(
'attributespec_set')
Expand Down Expand Up @@ -382,6 +390,8 @@ def _write_task(self, zip_object, target_dir=None):
def _write_manifest(self, zip_object, target_dir=None):
def serialize_task():
task_serializer = TaskReadSerializer(self._db_task)
# self.logger.info("WRITE MANIFEST")
# self.logger.info(task_serializer.data)
for field in ('url', 'owner', 'assignee'):
task_serializer.fields.pop(field)

Expand Down Expand Up @@ -641,6 +651,8 @@ def _write_data(zip_object):
jobs = self._manifest.pop('jobs')

self._prepare_task_meta(self._manifest)
self._logger.info("DEBUG IMPORT")
self._logger.info(self._manifest)
self._manifest['owner_id'] = self._user_id
self._manifest['project_id'] = self._project_id

Expand Down
91 changes: 39 additions & 52 deletions cvat/apps/engine/media_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@
import itertools
import struct
from enum import IntEnum
import chardet
from abc import ABC, abstractmethod
from contextlib import closing
from typing import Iterable
from cvat.apps.engine.log import ServerLogManager
slogger = ServerLogManager(__name__)
# from cvat.apps.engine.log import ServerLogManager
# slogger = ServerLogManager(__name__)

import av
import numpy as np
Expand Down Expand Up @@ -505,6 +506,26 @@ def _has_frame(self, i):

return False

def get_total_frames(self):
    """Count the decoded frames in the container's first audio stream.

    Demuxes and decodes the whole stream, so this is a full pass over the
    file; the cost is proportional to the audio length.
    """
    frame_count = 0
    with self._get_av_container() as container:
        audio_stream = container.streams.audio[0]
        audio_stream.thread_type = 'AUTO'  # let av choose a threading strategy
        for packet in container.demux(audio_stream):
            frame_count += sum(1 for _ in packet.decode())
    return frame_count

def get_file_encoding(self, file_path):
    """Detect the text encoding of *file_path* by sniffing its first 1 KiB.

    Returns the encoding name reported by chardet, or ``None`` when chardet
    cannot make a confident guess.
    """
    with open(file_path, 'rb') as sample_file:
        sample = sample_file.read(1024)
    detection = chardet.detect(sample)
    return detection['encoding']

def __iter__(self):
with self._get_av_container() as container:
stream = container.streams.audio[0]
Expand All @@ -523,7 +544,12 @@ def get_progress(self, pos):
def _get_av_container(self):
if isinstance(self._source_path[0], io.BytesIO):
self._source_path[0].seek(0) # required for re-reading
return av.open(self._source_path[0])

encoding = self.get_file_encoding(self._source_path[0])
if encoding:
return av.open(self._source_path[0], metadata_encoding = encoding)
else:
return av.open(self._source_path[0])

def _get_duration(self):
with self._get_av_container() as container:
Expand All @@ -543,25 +569,24 @@ def _get_duration(self):

def get_preview(self, frame):
with self._get_av_container() as container:
stream = container.streams.video[0]
stream = container.streams.audio[0]
tb_denominator = stream.time_base.denominator
needed_time = int((frame / stream.guessed_rate) * tb_denominator)
container.seek(offset=needed_time, stream=stream)
for packet in container.demux(stream):
for frame in packet.decode():
return self._get_preview(frame.to_image() if not stream.metadata.get('rotate') \
else av.VideoFrame().from_ndarray(
else av.AudioFrame().from_ndarray(
rotate_image(
frame.to_ndarray(format='bgr24'),
360 - int(container.streams.video[0].metadata.get('rotate'))
360 - int(container.streams.audio[0].metadata.get('rotate'))
),
format ='bgr24'
).to_image()
)

def get_image_size(self, i):
image = (next(iter(self)))[0]
return image.width, image.height
return 1, 1

class FragmentMediaReader:
def __init__(self, chunk_number, chunk_size, start, stop, step=1):
Expand Down Expand Up @@ -953,44 +978,6 @@ def save_as_chunk(self, images, chunk_path):
self._encode_images(images, output_container, output_v_stream)
return [(input_w, input_h)]

class AudioCompressedChunkWriter(AudioChunkWriter):
def __init__(self, quality):
super().__init__(quality)
if self._codec_name == 'libx264':
self._codec_opts = {
'profile': 'baseline',
'coder': '0',
'crf': str(self._image_quality),
'wpredp': '0',
'flags': '-loop',
}

def save_as_chunk(self, images, chunk_path):
if not images:
raise Exception('no images to save')

input_w = images[0][0].width
input_h = images[0][0].height

downscale_factor = 1
while input_h / downscale_factor >= 1080:
downscale_factor *= 2

output_h = input_h // downscale_factor
output_w = input_w // downscale_factor

with av.open(chunk_path, 'w', format=self.FORMAT) as output_container:
output_v_stream = self._add_video_stream(
container=output_container,
w=output_w,
h=output_h,
rate=self._output_fps,
options=self._codec_opts,
)

self._encode_images(images, output_container, output_v_stream)
return [(input_w, input_h)]

def _is_archive(path):
mime = mimetypes.guess_type(path)
mime_type = mime[0]
Expand Down Expand Up @@ -1043,18 +1030,18 @@ def _is_zip(path):
'mode': 'annotation',
'unique': False,
},
'video': {
'has_mime_type': _is_video,
'extractor': VideoReader,
'mode': 'interpolation',
'unique': True,
},
'audio': {
'has_mime_type': _is_audio,
'extractor': AudioReader,
'mode': 'interpolation',
'unique': False,
},
'video': {
'has_mime_type': _is_video,
'extractor': VideoReader,
'mode': 'interpolation',
'unique': True,
},
'archive': {
'has_mime_type': _is_archive,
'extractor': ArchiveReader,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 4.2.6 on 2024-04-15 05:10
# Generated by Django 4.2.6 on 2024-04-19 09:27

import cvat.apps.engine.models
from django.db import migrations, models
Expand Down
Loading