From bcfef061edfacaa6666da2da5df1e9063826b758 Mon Sep 17 00:00:00 2001 From: maya Date: Tue, 19 Nov 2024 18:36:40 +0100 Subject: [PATCH 01/32] Draft fix --- cvat/apps/dataset_manager/views.py | 8 ++++++-- cvat/rqworker.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 1dbff55ed08d..f7dae62a6ab0 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -26,7 +26,7 @@ LockNotAvailableError, current_function_name, get_export_cache_lock, get_export_cache_dir, make_export_filename, - parse_export_file_path + parse_export_file_path, make_export_cache_lock_key ) from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import @@ -133,11 +133,15 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No os.makedirs(cache_dir, exist_ok=True) + current_rq_job = rq.get_current_job() + current_rq_job.meta['lock_key'] = make_export_cache_lock_key(output_path) + current_rq_job.save_meta() + with get_export_cache_lock( output_path, block=True, acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT, - ttl=rq.get_current_job().timeout, + ttl=current_rq_job.timeout, ): if not osp.exists(output_path): with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: diff --git a/cvat/rqworker.py b/cvat/rqworker.py index 8a3e187b74b0..a48cdc265646 100644 --- a/cvat/rqworker.py +++ b/cvat/rqworker.py @@ -6,6 +6,9 @@ import os from rq import Worker +from rq.job import Job +from resource import struct_rusage +import signal import cvat.utils.remote_debugger as debug @@ -67,6 +70,18 @@ def execute_job(self, *args, **kwargs): DefaultWorker = RemoteDebugWorker +def handle_work_horse_killed(job: Job, retpid: int, ret_val: int, rusage: struct_rusage): + job.refresh() + if ret_val in (signal.SIGTERM, signal.SIGKILL) and (lock_key := job.meta.get('lock_key')): + job.connection.delete(lock_key) + + +class WorkerWithCustomHandler(DefaultWorker): + # https://github.com/rq/django-rq/issues/579 + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._work_horse_killed_handler = handle_work_horse_killed + if os.environ.get("COVERAGE_PROCESS_START"): import coverage From 15c3cb9f48bde9717e3d47e9f166675077743849 Mon Sep 17 00:00:00 2001 From: maya Date: Tue, 26 Nov 2024 13:23:20 +0100 Subject: [PATCH 02/32] Add a separate thread to prolong export lock --- cvat/apps/dataset_manager/default_settings.py | 13 +- cvat/apps/dataset_manager/util.py | 8 + cvat/apps/dataset_manager/views.py | 209 ++++++++++++++---- cvat/rqworker.py | 29 ++- 4 files changed, 206 insertions(+), 53 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index a4dd53b0f52e..73e9d94e7457 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -3,12 +3,23 @@ # SPDX-License-Identifier: MIT import os +import warnings DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 24)) "Base lifetime for cached exported datasets, in seconds" -DATASET_CACHE_LOCK_TIMEOUT = int(os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT", 10)) +DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") "Timeout for cache lock acquiring, in seconds" +if DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT is not None: + DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = int(DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT) + warnings.warn( + "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " + "use DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT instead", DeprecationWarning) +else: + DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = int(os.getenv("DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT", 10)) + +DATASET_EXPORT_LOCK_TTL = int(os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL", 60 * 5)) + DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) "Retry interval for cases the export cache lock was unavailable, in seconds" diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 0193748446f3..0d0b7c1478c5 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -99,6 +99,9 @@ def faster_deepcopy(v): class LockNotAvailableError(Exception): pass +class ProlongLockError(Exception): + pass + def make_export_cache_lock_key(filename: os.PathLike[str]) -> str: return f"export_lock:{os.fspath(filename)}" @@ -111,6 +114,7 @@ def get_export_cache_lock( ttl: int | timedelta, block: bool = True, acquire_timeout: Optional[int | timedelta] = None, + num_extensions: int | None = None, ) -> Generator[Lock, Any, Any]: if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() @@ -124,6 +128,9 @@ def get_export_cache_lock( if not ttl or ttl < 0: raise ValueError("ttl must be a non-negative number") + if num_extensions and num_extensions < 0: + raise ValueError("num_extensions must be a non-negative number") + # https://redis.io/docs/latest/develop/use/patterns/distributed-locks/ # The lock is exclusive, so it may potentially reduce performance in some cases, # where parallel access is potentially possible and valid, @@ -132,6 +139,7 @@ def get_export_cache_lock( key=make_export_cache_lock_key(export_path), masters={django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value)}, auto_release_time=ttl, + **({"num_extensions": num_extensions} if num_extensions is not None else {}), ) acquired = lock.acquire(blocking=block, timeout=acquire_timeout) try: diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index f7dae62a6ab0..5c8f17e653a2 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -8,9 +8,11 @@ import os.path as osp import tempfile from datetime import timedelta +import math import django_rq import rq +import threading from django.conf import settings from django.utils import timezone from rq_scheduler import Scheduler @@ -23,12 +25,18 @@ from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS from .util import ( - LockNotAvailableError, + LockNotAvailableError, ProlongLockError, current_function_name, get_export_cache_lock, get_export_cache_dir, make_export_filename, parse_export_file_path, make_export_cache_lock_key ) from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import +from pottery.redlock import Redlock +from pottery.exceptions import TooManyExtensions +from typing import Any, Callable + +from time import sleep +import signal slogger = ServerLogManager(__name__) @@ -50,8 +58,10 @@ def log_exception(logger=None, exc_info=True): 'job': JOB_CACHE_TTL, } -EXPORT_CACHE_LOCK_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_TIMEOUT) +EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT) EXPORT_LOCKED_RETRY_INTERVAL = timedelta(seconds=settings.DATASET_EXPORT_LOCKED_RETRY_INTERVAL) +EXPORT_LOCK_TTL = settings.DATASET_EXPORT_LOCK_TTL +EXPORT_LOCK_EXTEND_INTERVAL = EXPORT_LOCK_TTL - 2 # todo: move into const def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: @@ -96,7 +106,104 @@ def _patched_retry(*_1, **_2): setattr(current_rq_job, 'retry', _patched_retry) return current_rq_job -def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=None, save_images=False): + +class ProlongLockThread(threading.Thread): + def __init__( + self, + *, + lock: Redlock, + lock_extend_interval: int, + stop_event: threading.Event, + **kwargs, + ): + self.lock = lock + self.lock_extend_interval = lock_extend_interval + self.cur_sleep_interval = lock_extend_interval + self.stop_event = stop_event + self.logger = ServerLogManager(__name__) + self.max_retry_attempt_count = 3 + super().__init__(**kwargs, target=self._prolong_lock) + + def _reset(self): + self.cur_sleep_interval = self.lock_extend_interval + + def _prolong_lock(self): + """ + Prolong the lock's TTL every seconds until is set. + The stop event is checked every second to minimize waiting time when the export process is completed. + """ + + self.logger.glob.debug("The prolong_lock is called") + + while not self.stop_event.is_set(): + sleep(1) + self.cur_sleep_interval -= 1 + + if self.cur_sleep_interval: + continue + + self.logger.glob.debug( + f"Extend lock {self.lock.key}, number of remaining extensions: {self.lock.num_extensions - self.lock._extension_num}" + ) + for attempt_number in range(1, self.max_retry_attempt_count + 1): + try: + # raise Exception('Ooops') + self.lock.extend() + self._reset() + except Exception as ex: + self.logger.glob.exception( + f"Attempt number: {attempt_number}, " + f"an exception occurred during lock {self.lock.key} extension: ", + str(ex), + ) + if attempt_number == self.max_retry_attempt_count: + self.stop_event.set() + return + + +class ExportThread(threading.Thread): + def __init__( + self, + cache_dir: str, + output_path: str, + instance_id: int, + export_fn: Callable[..., None], + server_url: str | None, + save_images: bool, + dst_format: str, + ): + self.cache_dir = cache_dir + self.output_path = output_path + self.instance_id = instance_id + self.export_fn = export_fn + self.server_url = server_url + self.save_images = save_images + self.dst_format = dst_format + super().__init__(target=self._export_dataset) + + def _export_dataset(self): + # sleep(10) + with tempfile.TemporaryDirectory(dir=self.cache_dir) as temp_dir: + temp_file = osp.join(temp_dir, "result") + self.export_fn( + self.instance_id, + temp_file, + self.dst_format, + server_url=self.server_url, + save_images=self.save_images, + ) + os.replace(temp_file, self.output_path) + + +def export( + *, + dst_format: str, + project_id: int | None = None, + task_id: int | None = None, + job_id: int | None = None, + server_url: str | None = None, + save_images: bool = False, +): try: if task_id is not None: logger = slogger.task[task_id] @@ -134,44 +241,72 @@ def export(dst_format, project_id=None, task_id=None, job_id=None, server_url=No os.makedirs(cache_dir, exist_ok=True) current_rq_job = rq.get_current_job() - current_rq_job.meta['lock_key'] = make_export_cache_lock_key(output_path) + current_rq_job.meta["lock_key"] = make_export_cache_lock_key(output_path) current_rq_job.save_meta() + # It make sense to acquire a lock if real export process is running and we are writing data into a file + # If such a file exists, we can just return the path without acquiring a lock + if osp.exists(output_path): + return output_path + + stop_event = threading.Event() + with get_export_cache_lock( output_path, block=True, - acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT, - ttl=current_rq_job.timeout, - ): - if not osp.exists(output_path): - with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: - temp_file = osp.join(temp_dir, 'result') - export_fn(db_instance.id, temp_file, dst_format, - server_url=server_url, save_images=save_images) - os.replace(temp_file, output_path) - - scheduler: Scheduler = django_rq.get_scheduler( - settings.CVAT_QUEUES.EXPORT_DATA.value - ) - cleaning_job = scheduler.enqueue_in( - time_delta=cache_ttl, - func=clear_export_cache, - file_path=output_path, - file_ctime=instance_update_time.timestamp(), - logger=logger - ) - logger.info( - "The {} '{}' is exported as '{}' at '{}' " - "and available for downloading for the next {}. " - "Export cache cleaning job is enqueued, id '{}'".format( - db_instance.__class__.__name__.lower(), - db_instance.name if isinstance( - db_instance, (Project, Task) - ) else db_instance.id, - dst_format, output_path, cache_ttl, - cleaning_job.id - ) - ) + acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, + ttl=EXPORT_LOCK_TTL, + num_extensions=math.ceil(rq.get_current_job().timeout / EXPORT_LOCK_EXTEND_INTERVAL), + ) as red_lock: + extend_lock_thread = ProlongLockThread( + lock=red_lock, + lock_extend_interval=EXPORT_LOCK_EXTEND_INTERVAL, + stop_event=stop_event, + ) + extend_lock_thread.start() + + print(f"current pid: {os.getpid()}") + + export_thread = ExportThread( + cache_dir, + output_path, + db_instance.id, + export_fn, + server_url, + save_images, + dst_format, + ) + export_thread.start() + + while export_thread.is_alive(): + if stop_event.is_set(): + raise ProlongLockError("Export aborted because the lock extension failed.") + sleep(5) # todo: move into const + + export_thread.join() + stop_event.set() + extend_lock_thread.join() + + scheduler: Scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.EXPORT_DATA.value) + cleaning_job = scheduler.enqueue_in( + time_delta=cache_ttl, + func=clear_export_cache, + file_path=output_path, + file_ctime=instance_update_time.timestamp(), + logger=logger, + ) + logger.info( + "The {} '{}' is exported as '{}' at '{}' " + "and available for downloading for the next {}. " + "Export cache cleaning job is enqueued, id '{}'".format( + db_instance.__class__.__name__.lower(), + db_instance.name if isinstance(db_instance, (Project, Task)) else db_instance.id, + dst_format, + output_path, + cache_ttl, + cleaning_job.id, + ) + ) return output_path except LockNotAvailableError: @@ -216,7 +351,7 @@ def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger with get_export_cache_lock( file_path, block=True, - acquire_timeout=EXPORT_CACHE_LOCK_TIMEOUT, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, ttl=rq.get_current_job().timeout, ): if not osp.exists(file_path): diff --git a/cvat/rqworker.py b/cvat/rqworker.py index a48cdc265646..548dd9a4688a 100644 --- a/cvat/rqworker.py +++ b/cvat/rqworker.py @@ -13,9 +13,6 @@ import cvat.utils.remote_debugger as debug -DefaultWorker = Worker - - class BaseDeathPenalty: def __init__(self, timeout, exception, **kwargs): pass @@ -26,6 +23,20 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): pass +def handle_work_horse_killed(job: Job, retpid: int, ret_val: int, rusage: struct_rusage): + print('IMPORTANT: handle_work_horse_killed') + job.refresh() + if ret_val in (signal.SIGTERM, signal.SIGINT) and (lock_key := job.meta.get('lock_key')): + # todo: check lock ttl + job.connection.delete(lock_key) + +class WorkerWithCustomHandler(Worker): + # https://github.com/rq/django-rq/issues/579 + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._work_horse_killed_handler = handle_work_horse_killed + +DefaultWorker = WorkerWithCustomHandler class SimpleWorker(Worker): """ @@ -70,18 +81,6 @@ def execute_job(self, *args, **kwargs): DefaultWorker = RemoteDebugWorker -def handle_work_horse_killed(job: Job, retpid: int, ret_val: int, rusage: struct_rusage): - job.refresh() - if ret_val in (signal.SIGTERM, signal.SIGKILL) and (lock_key := job.meta.get('lock_key')): - job.connection.delete(lock_key) - - -class WorkerWithCustomHandler(DefaultWorker): - # https://github.com/rq/django-rq/issues/579 - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._work_horse_killed_handler = handle_work_horse_killed - if os.environ.get("COVERAGE_PROCESS_START"): import coverage From c4ef5026eb37ce6e7af1871f9c8956027571a239 Mon Sep 17 00:00:00 2001 From: maya Date: Tue, 26 Nov 2024 18:23:57 +0100 Subject: [PATCH 03/32] Set job status to SCHEDULED manually && remove job from rq:scheduler:scheduled_jobs on cancel --- cvat/apps/dataset_manager/views.py | 38 +++++++++++++++++++----------- cvat/apps/engine/background.py | 23 ++++++++++++------ 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 5c8f17e653a2..5a28018f0324 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -70,6 +70,14 @@ def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: return TTL_CONSTS[db_instance.lower()] +def _patch_scheduled_job_status(job: rq.job.Job): + # NOTE: rq scheduler < 0.14 does not set the appropriate + # job status SCHEDULED. It has been fixed in the 0.14 version + # https://github.com/rq/rq-scheduler/blob/f7d5787c5f94b5517e209c612ef648f4bfc44f9e/rq_scheduler/scheduler.py#L148 + # FUTURE-TODO: delete manual status setting after upgrading to 0.14 + if job.get_status(refresh=False) != rq.job.JobStatus.SCHEDULED: + job.set_status(rq.job.JobStatus.SCHEDULED) + def _retry_current_rq_job(time_delta: timedelta) -> rq.job.Job: # TODO: implement using retries once we move from rq_scheduler to builtin RQ scheduler # for better reliability and error reporting @@ -88,7 +96,7 @@ def _patched_retry(*_1, **_2): user_id = current_rq_job.meta.get('user', {}).get('id') or -1 with get_rq_lock_by_user(settings.CVAT_QUEUES.EXPORT_DATA.value, user_id): - scheduler.enqueue_in( + scheduled_rq_job: rq.job.Job = scheduler.enqueue_in( time_delta, current_rq_job.func, *current_rq_job.args, @@ -101,6 +109,7 @@ def _patched_retry(*_1, **_2): on_success=current_rq_job.success_callback, on_failure=current_rq_job.failure_callback, ) + _patch_scheduled_job_status(scheduled_rq_job) current_rq_job.retries_left = 1 setattr(current_rq_job, 'retry', _patched_retry) @@ -244,7 +253,7 @@ def export( current_rq_job.meta["lock_key"] = make_export_cache_lock_key(output_path) current_rq_job.save_meta() - # It make sense to acquire a lock if real export process is running and we are writing data into a file + # Acquiring a lock makes sense if a real export process is running and we are writing data into a file # If such a file exists, we can just return the path without acquiring a lock if osp.exists(output_path): return output_path @@ -295,6 +304,7 @@ def export( file_ctime=instance_update_time.timestamp(), logger=logger, ) + _patch_scheduled_job_status(cleaning_job) logger.info( "The {} '{}' is exported as '{}' at '{}' " "and available for downloading for the next {}. " @@ -322,23 +332,23 @@ def export( log_exception(logger) raise -def export_job_annotations(job_id, dst_format=None, server_url=None): - return export(dst_format,job_id=job_id, server_url=server_url, save_images=False) +def export_job_annotations(job_id: int, dst_format: str, *, server_url: str | None = None): + return export(dst_format=dst_format, job_id=job_id, server_url=server_url, save_images=False) -def export_job_as_dataset(job_id, dst_format=None, server_url=None): - return export(dst_format, job_id=job_id, server_url=server_url, save_images=True) +def export_job_as_dataset(job_id: int, dst_format: str, *, server_url: str | None = None): + return export(dst_format=dst_format, job_id=job_id, server_url=server_url, save_images=True) -def export_task_as_dataset(task_id, dst_format=None, server_url=None): - return export(dst_format, task_id=task_id, server_url=server_url, save_images=True) +def export_task_as_dataset(task_id: int, dst_format: str, *, server_url: str | None = None): + return export(dst_format=dst_format, task_id=task_id, server_url=server_url, save_images=True) -def export_task_annotations(task_id, dst_format=None, server_url=None): - return export(dst_format,task_id=task_id, server_url=server_url, save_images=False) +def export_task_annotations(task_id: int, dst_format: str, *, server_url: str | None = None): + return export(dst_format=dst_format, task_id=task_id, server_url=server_url, save_images=False) -def export_project_as_dataset(project_id, dst_format=None, server_url=None): - return export(dst_format, project_id=project_id, server_url=server_url, save_images=True) +def export_project_as_dataset(project_id: int, dst_format: str, *, server_url: str | None = None): + return export(dst_format=dst_format, project_id=project_id, server_url=server_url, save_images=True) -def export_project_annotations(project_id, dst_format=None, server_url=None): - return export(dst_format, project_id=project_id, server_url=server_url, save_images=False) +def export_project_annotations(project_id: int, dst_format: str, *, server_url: str | None = None): + return export(dst_format=dst_format, project_id=project_id, server_url=server_url, save_images=False) class FileIsBeingUsedError(Exception): diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 441d4702014d..b6274a34fa71 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -14,7 +14,7 @@ from django.conf import settings from django.http.response import HttpResponseBadRequest from django.utils import timezone -from django_rq.queues import DjangoRQ +from django_rq.queues import DjangoRQ, DjangoScheduler from rest_framework import serializers, status from rest_framework.request import Request from rest_framework.response import Response @@ -89,7 +89,7 @@ def setup_background_job(self, queue: DjangoRQ, rq_id: str) -> None: def _handle_rq_job_v1(self, rq_job: Optional[RQJob], queue: DjangoRQ) -> Optional[Response]: pass - def _handle_rq_job_v2(self, rq_job: Optional[RQJob], *args, **kwargs) -> Optional[Response]: + def _handle_rq_job_v2(self, rq_job: Optional[RQJob], queue: DjangoRQ) -> Optional[Response]: if not rq_job: return None @@ -101,17 +101,23 @@ def _handle_rq_job_v2(self, rq_job: Optional[RQJob], *args, **kwargs) -> Optiona status=status.HTTP_409_CONFLICT, ) - if rq_job_status in (RQJobStatus.SCHEDULED, RQJobStatus.DEFERRED): + if rq_job_status == RQJobStatus.DEFERRED: + rq_job.cancel(enqueue_dependents=settings.ONE_RUNNING_JOB_IN_QUEUE_PER_USER) + + if rq_job_status == RQJobStatus.SCHEDULED: + scheduler: DjangoScheduler = django_rq.get_scheduler(queue.name, queue=queue) + # remove the job id from the set with scheduled keys + scheduler.cancel(rq_job) rq_job.cancel(enqueue_dependents=settings.ONE_RUNNING_JOB_IN_QUEUE_PER_USER) rq_job.delete() return None - def handle_rq_job(self, *args, **kwargs) -> Optional[Response]: + def handle_rq_job(self, rq_job: RQJob, queue: DjangoRQ) -> Optional[Response]: if self.version == 1: - return self._handle_rq_job_v1(*args, **kwargs) + return self._handle_rq_job_v1(rq_job, queue) elif self.version == 2: - return self._handle_rq_job_v2(*args, **kwargs) + return self._handle_rq_job_v2(rq_job, queue) raise ValueError("Unsupported version") @@ -422,7 +428,7 @@ def setup_background_job( user_id = self.request.user.id func = self.export_callback - func_args = (self.db_instance.id, self.export_args.format, server_address) + func_args = (self.db_instance.id, self.export_args.format) result_url = None if self.export_args.location == Location.CLOUD_STORAGE: @@ -467,6 +473,9 @@ def setup_background_job( queue.enqueue_call( func=func, args=func_args, + kwargs={ + "server_url": server_address, + }, job_id=rq_id, meta=get_rq_job_meta( request=self.request, db_obj=self.db_instance, result_url=result_url From 26fce865760e95a4221abd916cd5e7bb6da362ca Mon Sep 17 00:00:00 2001 From: maya Date: Tue, 26 Nov 2024 18:30:50 +0100 Subject: [PATCH 04/32] Format code --- cvat/apps/dataset_manager/views.py | 6 ++---- cvat/apps/engine/background.py | 2 +- cvat/rqworker.py | 12 ++++++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 5a28018f0324..07c7a31bb184 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -32,11 +32,9 @@ ) from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import from pottery.redlock import Redlock -from pottery.exceptions import TooManyExtensions -from typing import Any, Callable +from typing import Callable from time import sleep -import signal slogger = ServerLogManager(__name__) @@ -72,7 +70,7 @@ def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: def _patch_scheduled_job_status(job: rq.job.Job): # NOTE: rq scheduler < 0.14 does not set the appropriate - # job status SCHEDULED. It has been fixed in the 0.14 version + # job status (SCHEDULED). This has been fixed in the 0.14 version. # https://github.com/rq/rq-scheduler/blob/f7d5787c5f94b5517e209c612ef648f4bfc44f9e/rq_scheduler/scheduler.py#L148 # FUTURE-TODO: delete manual status setting after upgrading to 0.14 if job.get_status(refresh=False) != rq.job.JobStatus.SCHEDULED: diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index b6274a34fa71..64f9a8746791 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -113,7 +113,7 @@ def _handle_rq_job_v2(self, rq_job: Optional[RQJob], queue: DjangoRQ) -> Optiona rq_job.delete() return None - def handle_rq_job(self, rq_job: RQJob, queue: DjangoRQ) -> Optional[Response]: + def handle_rq_job(self, rq_job: RQJob | None, queue: DjangoRQ) -> Optional[Response]: if self.version == 1: return self._handle_rq_job_v1(rq_job, queue) elif self.version == 2: diff --git a/cvat/rqworker.py b/cvat/rqworker.py index 548dd9a4688a..55027c3f7b44 100644 --- a/cvat/rqworker.py +++ b/cvat/rqworker.py @@ -4,11 +4,11 @@ # SPDX-License-Identifier: MIT import os +import signal +from resource import struct_rusage from rq import Worker from rq.job import Job -from resource import struct_rusage -import signal import cvat.utils.remote_debugger as debug @@ -23,21 +23,25 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): pass + def handle_work_horse_killed(job: Job, retpid: int, ret_val: int, rusage: struct_rusage): - print('IMPORTANT: handle_work_horse_killed') + print("IMPORTANT: handle_work_horse_killed") job.refresh() - if ret_val in (signal.SIGTERM, signal.SIGINT) and (lock_key := job.meta.get('lock_key')): + if ret_val in (signal.SIGTERM, signal.SIGINT) and (lock_key := job.meta.get("lock_key")): # todo: check lock ttl job.connection.delete(lock_key) + class WorkerWithCustomHandler(Worker): # https://github.com/rq/django-rq/issues/579 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._work_horse_killed_handler = handle_work_horse_killed + DefaultWorker = WorkerWithCustomHandler + class SimpleWorker(Worker): """ Allows to work with at most 1 worker thread. Useful for debugging. From 7f80ff1927adde6c9200d1eaf2051a6b2aca8c98 Mon Sep 17 00:00:00 2001 From: maya Date: Thu, 28 Nov 2024 12:49:44 +0100 Subject: [PATCH 05/32] Drop SIGTERM/SIGINT handling --- cvat/rqworker.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/cvat/rqworker.py b/cvat/rqworker.py index 55027c3f7b44..8a3e187b74b0 100644 --- a/cvat/rqworker.py +++ b/cvat/rqworker.py @@ -4,15 +4,15 @@ # SPDX-License-Identifier: MIT import os -import signal -from resource import struct_rusage from rq import Worker -from rq.job import Job import cvat.utils.remote_debugger as debug +DefaultWorker = Worker + + class BaseDeathPenalty: def __init__(self, timeout, exception, **kwargs): pass @@ -24,24 +24,6 @@ def __exit__(self, exc_type, exc_value, traceback): pass -def handle_work_horse_killed(job: Job, retpid: int, ret_val: int, rusage: struct_rusage): - print("IMPORTANT: handle_work_horse_killed") - job.refresh() - if ret_val in (signal.SIGTERM, signal.SIGINT) and (lock_key := job.meta.get("lock_key")): - # todo: check lock ttl - job.connection.delete(lock_key) - - -class WorkerWithCustomHandler(Worker): - # https://github.com/rq/django-rq/issues/579 - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._work_horse_killed_handler = handle_work_horse_killed - - -DefaultWorker = WorkerWithCustomHandler - - class SimpleWorker(Worker): """ Allows to work with at most 1 worker thread. Useful for debugging. From e1c9de1551c486cb6ebaa27cfd319af33d076130 Mon Sep 17 00:00:00 2001 From: maya Date: Thu, 28 Nov 2024 12:50:36 +0100 Subject: [PATCH 06/32] Update default settings --- cvat/apps/dataset_manager/default_settings.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index 73e9d94e7457..ec1f99bfa417 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -8,6 +8,15 @@ DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 24)) "Base lifetime for cached exported datasets, in seconds" +default_dataset_export_lock_ttl = 60 * 5 +DATASET_EXPORT_LOCK_TTL = int(os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL", default_dataset_export_lock_ttl)) +""" +Default lifetime for the export cache lock, in seconds. +This value should be short enough to minimize the waiting time until the lock is automatically released +(e.g., in cases where a worker process is killed by the OOM killer and the lock is not released). +The lock will be automatically extended as needed for the duration of the worker process. +""" + DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") "Timeout for cache lock acquiring, in seconds" @@ -17,9 +26,13 @@ "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " "use DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT instead", DeprecationWarning) else: + default_dataset_lock_acquire_timeout = default_dataset_export_lock_ttl + 5 + """ + Set default lock acquire timeout to the default lock lifetime + small buffer + to handle possible cases when a lock wasn't released by the worker process + and will be released automatically by Redis + """ DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = int(os.getenv("DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT", 10)) -DATASET_EXPORT_LOCK_TTL = int(os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL", 60 * 5)) - DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) "Retry interval for cases the export cache lock was unavailable, in seconds" From 607a839faaa24a3c75758e9f13d6253fbb68b9f6 Mon Sep 17 00:00:00 2001 From: maya Date: Thu, 28 Nov 2024 13:29:04 +0100 Subject: [PATCH 07/32] Refactor code --- cvat/apps/dataset_manager/default_settings.py | 2 +- cvat/apps/dataset_manager/util.py | 2 +- cvat/apps/dataset_manager/views.py | 52 ++++++++----------- 3 files changed, 25 insertions(+), 31 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index ec1f99bfa417..f1e7a2636ad4 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -32,7 +32,7 @@ to handle possible cases when a lock wasn't released by the worker process and will be released automatically by Redis """ - DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = int(os.getenv("DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT", 10)) + DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = int(os.getenv("DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT", default_dataset_lock_acquire_timeout)) DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) "Retry interval for cases the export cache lock was unavailable, in seconds" diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 0d0b7c1478c5..fa1effeb6f8f 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -99,7 +99,7 @@ def faster_deepcopy(v): class LockNotAvailableError(Exception): pass -class ProlongLockError(Exception): +class ExtendLockError(Exception): pass diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 07c7a31bb184..b80a6b747932 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -25,10 +25,10 @@ from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS from .util import ( - LockNotAvailableError, ProlongLockError, + LockNotAvailableError, ExtendLockError, current_function_name, get_export_cache_lock, get_export_cache_dir, make_export_filename, - parse_export_file_path, make_export_cache_lock_key + parse_export_file_path ) from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import from pottery.redlock import Redlock @@ -39,7 +39,8 @@ slogger = ServerLogManager(__name__) _MODULE_NAME = __package__ + '.' + osp.splitext(osp.basename(__file__))[0] -def log_exception(logger=None, exc_info=True): + +def log_exception(logger: logging.Logger | None = None, exc_info: bool = True): if logger is None: logger = slogger logger.exception("[%s @ %s]: exception occurred" % \ @@ -59,7 +60,8 @@ def log_exception(logger=None, exc_info=True): EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT) EXPORT_LOCKED_RETRY_INTERVAL = timedelta(seconds=settings.DATASET_EXPORT_LOCKED_RETRY_INTERVAL) EXPORT_LOCK_TTL = settings.DATASET_EXPORT_LOCK_TTL -EXPORT_LOCK_EXTEND_INTERVAL = EXPORT_LOCK_TTL - 2 # todo: move into const +# prevent lock auto releasing when extending a lock by setting a slightly lower value +EXPORT_LOCK_EXTEND_INTERVAL = EXPORT_LOCK_TTL - 2 def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: @@ -114,34 +116,32 @@ def _patched_retry(*_1, **_2): return current_rq_job -class ProlongLockThread(threading.Thread): +class ExtendLockThread(threading.Thread): def __init__( self, *, lock: Redlock, lock_extend_interval: int, stop_event: threading.Event, - **kwargs, ): + super().__init__(target=self._extend_lock) + self.lock = lock self.lock_extend_interval = lock_extend_interval self.cur_sleep_interval = lock_extend_interval self.stop_event = stop_event self.logger = ServerLogManager(__name__) self.max_retry_attempt_count = 3 - super().__init__(**kwargs, target=self._prolong_lock) def _reset(self): self.cur_sleep_interval = self.lock_extend_interval - def _prolong_lock(self): + def _extend_lock(self): """ - Prolong the lock's TTL every seconds until is set. + Extend the lock's TTL every seconds until is set. The stop event is checked every second to minimize waiting time when the export process is completed. """ - self.logger.glob.debug("The prolong_lock is called") - while not self.stop_event.is_set(): sleep(1) self.cur_sleep_interval -= 1 @@ -150,11 +150,11 @@ def _prolong_lock(self): continue self.logger.glob.debug( - f"Extend lock {self.lock.key}, number of remaining extensions: {self.lock.num_extensions - self.lock._extension_num}" + f"Extend lock {self.lock.key}, number of remaining extensions: " + f"{self.lock.num_extensions - self.lock._extension_num}" ) for attempt_number in range(1, self.max_retry_attempt_count + 1): try: - # raise Exception('Ooops') self.lock.extend() self._reset() except Exception as ex: @@ -179,6 +179,7 @@ def __init__( save_images: bool, dst_format: str, ): + super().__init__(target=self._export_dataset) self.cache_dir = cache_dir self.output_path = output_path self.instance_id = instance_id @@ -186,10 +187,8 @@ def __init__( self.server_url = server_url self.save_images = save_images self.dst_format = dst_format - super().__init__(target=self._export_dataset) def _export_dataset(self): - # sleep(10) with tempfile.TemporaryDirectory(dir=self.cache_dir) as temp_dir: temp_file = osp.join(temp_dir, "result") self.export_fn( @@ -247,15 +246,6 @@ def export( os.makedirs(cache_dir, exist_ok=True) - current_rq_job = rq.get_current_job() - current_rq_job.meta["lock_key"] = make_export_cache_lock_key(output_path) - current_rq_job.save_meta() - - # Acquiring a lock makes sense if a real export process is running and we are writing data into a file - # If such a file exists, we can just return the path without acquiring a lock - if osp.exists(output_path): - return output_path - stop_event = threading.Event() with get_export_cache_lock( @@ -265,15 +255,19 @@ def export( ttl=EXPORT_LOCK_TTL, num_extensions=math.ceil(rq.get_current_job().timeout / EXPORT_LOCK_EXTEND_INTERVAL), ) as red_lock: - extend_lock_thread = ProlongLockThread( + if osp.exists(output_path): + # Update last update time to prolong the export lifetime + # and postpone the file deleting by the cleaning job + os.utime(output_path, None) + return output_path + + extend_lock_thread = ExtendLockThread( lock=red_lock, lock_extend_interval=EXPORT_LOCK_EXTEND_INTERVAL, stop_event=stop_event, ) extend_lock_thread.start() - print(f"current pid: {os.getpid()}") - export_thread = ExportThread( cache_dir, output_path, @@ -287,8 +281,8 @@ def export( while export_thread.is_alive(): if stop_event.is_set(): - raise ProlongLockError("Export aborted because the lock extension failed.") - sleep(5) # todo: move into const + raise ExtendLockError("Export aborted because the lock extension failed.") + sleep(5) export_thread.join() stop_event.set() From a6b6f0fedace01f6e1404cd44a3d75cb46d09cfc Mon Sep 17 00:00:00 2001 From: maya Date: Thu, 28 Nov 2024 13:29:39 +0100 Subject: [PATCH 08/32] [REST API tests] Increase waiting timeout --- tests/python/rest_api/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/python/rest_api/utils.py b/tests/python/rest_api/utils.py index 434c3705ddc3..dac0b408ff80 100644 --- a/tests/python/rest_api/utils.py +++ b/tests/python/rest_api/utils.py @@ -44,7 +44,7 @@ def initialize_export(endpoint: Endpoint, *, expect_forbidden: bool = False, **k def wait_and_download_v1( endpoint: Endpoint, *, - max_retries: int = 30, + max_retries: int = 100, interval: float = 0.1, download_result: bool = True, **kwargs, @@ -75,7 +75,7 @@ def wait_and_download_v1( def export_v1( endpoint: Endpoint, *, - max_retries: int = 30, + max_retries: int = 100, interval: float = 0.1, expect_forbidden: bool = False, wait_result: bool = True, @@ -115,7 +115,7 @@ def wait_and_download_v2( api_client: ApiClient, rq_id: str, *, - max_retries: int = 30, + max_retries: int = 100, interval: float = 0.1, download_result: bool = True, ) -> Optional[bytes]: @@ -153,7 +153,7 @@ def wait_and_download_v2( def export_v2( endpoint: Endpoint, *, - max_retries: int = 30, + max_retries: int = 100, interval: float = 0.1, expect_forbidden: bool = False, wait_result: bool = True, From c61a74b05153268666ecbfa8ddd09011ed471a31 Mon Sep 17 00:00:00 2001 From: maya Date: Thu, 28 Nov 2024 13:30:06 +0100 Subject: [PATCH 09/32] t --- cvat/apps/dataset_manager/default_settings.py | 11 ++++++++++- cvat/apps/dataset_manager/views.py | 4 ++-- cvat/rqworker.py | 1 - tests/python/rest_api/utils.py | 16 ++++++++-------- 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index f1e7a2636ad4..e441a8e7fc01 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -4,11 +4,14 @@ import os import warnings +from django.core.exceptions import ImproperlyConfigured + DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 24)) "Base lifetime for cached exported datasets, in seconds" default_dataset_export_lock_ttl = 60 * 5 + DATASET_EXPORT_LOCK_TTL = int(os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL", default_dataset_export_lock_ttl)) """ Default lifetime for the export cache lock, in seconds. @@ -17,6 +20,12 @@ The lock will be automatically extended as needed for the duration of the worker process. """ +# prevent lock auto releasing when extending a lock by setting a slightly lower value +DATASET_EXPORT_LOCK_EXTEND_INTERVAL = DATASET_EXPORT_LOCK_TTL - 10 + +if DATASET_EXPORT_LOCK_EXTEND_INTERVAL < 5: + raise ImproperlyConfigured("Recheck value of DATASET_EXPORT_LOCK_TTL") + DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") "Timeout for cache lock acquiring, in seconds" @@ -26,7 +35,7 @@ "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " "use DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT instead", DeprecationWarning) else: - default_dataset_lock_acquire_timeout = default_dataset_export_lock_ttl + 5 + default_dataset_lock_acquire_timeout = default_dataset_export_lock_ttl + 30 """ Set default lock acquire timeout to the default lock lifetime + small buffer to handle possible cases when a lock wasn't released by the worker process diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index b80a6b747932..779faf06334a 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -60,8 +60,8 @@ def log_exception(logger: logging.Logger | None = None, exc_info: bool = True): EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT) EXPORT_LOCKED_RETRY_INTERVAL = timedelta(seconds=settings.DATASET_EXPORT_LOCKED_RETRY_INTERVAL) EXPORT_LOCK_TTL = settings.DATASET_EXPORT_LOCK_TTL -# prevent lock auto releasing when extending a lock by setting a slightly lower value -EXPORT_LOCK_EXTEND_INTERVAL = EXPORT_LOCK_TTL - 2 + +EXPORT_LOCK_EXTEND_INTERVAL = settings.DATASET_EXPORT_LOCK_EXTEND_INTERVAL def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: diff --git a/cvat/rqworker.py b/cvat/rqworker.py index 8a3e187b74b0..309c4d5ce714 100644 --- a/cvat/rqworker.py +++ b/cvat/rqworker.py @@ -9,7 +9,6 @@ import cvat.utils.remote_debugger as debug - DefaultWorker = Worker diff --git a/tests/python/rest_api/utils.py b/tests/python/rest_api/utils.py index dac0b408ff80..d6bb156b0f8e 100644 --- a/tests/python/rest_api/utils.py +++ b/tests/python/rest_api/utils.py @@ -44,7 +44,7 @@ def initialize_export(endpoint: Endpoint, *, expect_forbidden: bool = False, **k def wait_and_download_v1( endpoint: Endpoint, *, - max_retries: int = 100, + max_retries: int = 50, interval: float = 0.1, download_result: bool = True, **kwargs, @@ -75,7 +75,7 @@ def wait_and_download_v1( def export_v1( endpoint: Endpoint, *, - max_retries: int = 100, + max_retries: int = 50, interval: float = 0.1, expect_forbidden: bool = False, wait_result: bool = True, @@ -115,7 +115,7 @@ def wait_and_download_v2( api_client: ApiClient, rq_id: str, *, - max_retries: int = 100, + max_retries: int = 50, interval: float = 0.1, download_result: bool = True, ) -> Optional[bytes]: @@ -153,7 +153,7 @@ def wait_and_download_v2( def export_v2( endpoint: Endpoint, *, - max_retries: int = 100, + max_retries: int = 50, interval: float = 0.1, expect_forbidden: bool = False, wait_result: bool = True, @@ -196,7 +196,7 @@ def export_dataset( ], # make this parameter required to be sure that all tests was updated and both API versions are used *, save_images: bool, - max_retries: int = 30, + max_retries: int = 50, interval: float = 0.1, format: str = "CVAT for images 1.1", # pylint: disable=redefined-builtin **kwargs, @@ -278,7 +278,7 @@ def export_backup( int, tuple[int] ], # make this parameter required to be sure that all tests was updated and both API versions are used *, - max_retries: int = 30, + max_retries: int = 50, interval: float = 0.1, **kwargs, ) -> Optional[bytes]: @@ -326,7 +326,7 @@ def export_task_backup( def import_resource( endpoint: Endpoint, *, - max_retries: int = 30, + max_retries: int = 50, interval: float = 0.1, expect_forbidden: bool = False, wait_result: bool = True, @@ -372,7 +372,7 @@ def import_resource( def import_backup( api: Union[ProjectsApi, TasksApi], *, - max_retries: int = 30, + max_retries: int = 50, interval: float = 0.1, **kwargs, ) -> None: From 80af002cfe81f26d6a9ff1faf5602439a56cc230 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Tue, 3 Dec 2024 13:56:13 +0100 Subject: [PATCH 10/32] Switch to short locks --- cvat/apps/dataset_manager/default_settings.py | 27 +--- cvat/apps/dataset_manager/util.py | 11 +- cvat/apps/dataset_manager/views.py | 144 +++--------------- cvat/apps/engine/background.py | 14 +- 4 files changed, 37 insertions(+), 159 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index e441a8e7fc01..2218a220d8b0 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -10,21 +10,10 @@ DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 24)) "Base lifetime for cached exported datasets, in seconds" -default_dataset_export_lock_ttl = 60 * 5 +default_dataset_export_lock_ttl = 30 DATASET_EXPORT_LOCK_TTL = int(os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL", default_dataset_export_lock_ttl)) -""" -Default lifetime for the export cache lock, in seconds. -This value should be short enough to minimize the waiting time until the lock is automatically released -(e.g., in cases where a worker process is killed by the OOM killer and the lock is not released). -The lock will be automatically extended as needed for the duration of the worker process. -""" - -# prevent lock auto releasing when extending a lock by setting a slightly lower value -DATASET_EXPORT_LOCK_EXTEND_INTERVAL = DATASET_EXPORT_LOCK_TTL - 10 - -if DATASET_EXPORT_LOCK_EXTEND_INTERVAL < 5: - raise ImproperlyConfigured("Recheck value of DATASET_EXPORT_LOCK_TTL") +"Default lifetime for the export cache lock, in seconds." DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") "Timeout for cache lock acquiring, in seconds" @@ -35,13 +24,13 @@ "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " "use DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT instead", DeprecationWarning) else: - default_dataset_lock_acquire_timeout = default_dataset_export_lock_ttl + 30 - """ - Set default lock acquire timeout to the default lock lifetime + small buffer - to handle possible cases when a lock wasn't released by the worker process - and will be released automatically by Redis - """ + default_dataset_lock_acquire_timeout = 2 * default_dataset_export_lock_ttl DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = int(os.getenv("DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT", default_dataset_lock_acquire_timeout)) +if DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT <= DATASET_EXPORT_LOCK_TTL: + raise ImproperlyConfigured( + "Lock acquire timeout must be more than lock TTL" + ) + DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) "Retry interval for cases the export cache lock was unavailable, in seconds" diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index fa1effeb6f8f..b813454c08a7 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -99,9 +99,6 @@ def faster_deepcopy(v): class LockNotAvailableError(Exception): pass -class ExtendLockError(Exception): - pass - def make_export_cache_lock_key(filename: os.PathLike[str]) -> str: return f"export_lock:{os.fspath(filename)}" @@ -113,8 +110,8 @@ def get_export_cache_lock( *, ttl: int | timedelta, block: bool = True, - acquire_timeout: Optional[int | timedelta] = None, - num_extensions: int | None = None, + # endless waiting for the lock should be avoided at least by default + acquire_timeout: int | timedelta | None, ) -> Generator[Lock, Any, Any]: if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() @@ -128,9 +125,6 @@ def get_export_cache_lock( if not ttl or ttl < 0: raise ValueError("ttl must be a non-negative number") - if num_extensions and num_extensions < 0: - raise ValueError("num_extensions must be a non-negative number") - # https://redis.io/docs/latest/develop/use/patterns/distributed-locks/ # The lock is exclusive, so it may potentially reduce performance in some cases, # where parallel access is potentially possible and valid, @@ -139,7 +133,6 @@ def get_export_cache_lock( key=make_export_cache_lock_key(export_path), masters={django_rq.get_connection(settings.CVAT_QUEUES.EXPORT_DATA.value)}, auto_release_time=ttl, - **({"num_extensions": num_extensions} if num_extensions is not None else {}), ) acquired = lock.acquire(blocking=block, timeout=acquire_timeout) try: diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 779faf06334a..2d735d2f75dd 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -8,11 +8,9 @@ import os.path as osp import tempfile from datetime import timedelta -import math import django_rq import rq -import threading from django.conf import settings from django.utils import timezone from rq_scheduler import Scheduler @@ -25,16 +23,13 @@ from .formats.registry import EXPORT_FORMATS, IMPORT_FORMATS from .util import ( - LockNotAvailableError, ExtendLockError, + LockNotAvailableError, current_function_name, get_export_cache_lock, get_export_cache_dir, make_export_filename, parse_export_file_path ) from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import -from pottery.redlock import Redlock -from typing import Callable -from time import sleep slogger = ServerLogManager(__name__) @@ -59,9 +54,7 @@ def log_exception(logger: logging.Logger | None = None, exc_info: bool = True): EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT) EXPORT_LOCKED_RETRY_INTERVAL = timedelta(seconds=settings.DATASET_EXPORT_LOCKED_RETRY_INTERVAL) -EXPORT_LOCK_TTL = settings.DATASET_EXPORT_LOCK_TTL - -EXPORT_LOCK_EXTEND_INTERVAL = settings.DATASET_EXPORT_LOCK_EXTEND_INTERVAL +EXPORT_LOCK_TTL = timedelta(seconds=settings.DATASET_EXPORT_LOCK_TTL) def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: @@ -115,92 +108,6 @@ def _patched_retry(*_1, **_2): setattr(current_rq_job, 'retry', _patched_retry) return current_rq_job - -class ExtendLockThread(threading.Thread): - def __init__( - self, - *, - lock: Redlock, - lock_extend_interval: int, - stop_event: threading.Event, - ): - super().__init__(target=self._extend_lock) - - self.lock = lock - self.lock_extend_interval = lock_extend_interval - self.cur_sleep_interval = lock_extend_interval - self.stop_event = stop_event - self.logger = ServerLogManager(__name__) - self.max_retry_attempt_count = 3 - - def _reset(self): - self.cur_sleep_interval = self.lock_extend_interval - - def _extend_lock(self): - """ - Extend the lock's TTL every seconds until is set. - The stop event is checked every second to minimize waiting time when the export process is completed. - """ - - while not self.stop_event.is_set(): - sleep(1) - self.cur_sleep_interval -= 1 - - if self.cur_sleep_interval: - continue - - self.logger.glob.debug( - f"Extend lock {self.lock.key}, number of remaining extensions: " - f"{self.lock.num_extensions - self.lock._extension_num}" - ) - for attempt_number in range(1, self.max_retry_attempt_count + 1): - try: - self.lock.extend() - self._reset() - except Exception as ex: - self.logger.glob.exception( - f"Attempt number: {attempt_number}, " - f"an exception occurred during lock {self.lock.key} extension: ", - str(ex), - ) - if attempt_number == self.max_retry_attempt_count: - self.stop_event.set() - return - - -class ExportThread(threading.Thread): - def __init__( - self, - cache_dir: str, - output_path: str, - instance_id: int, - export_fn: Callable[..., None], - server_url: str | None, - save_images: bool, - dst_format: str, - ): - super().__init__(target=self._export_dataset) - self.cache_dir = cache_dir - self.output_path = output_path - self.instance_id = instance_id - self.export_fn = export_fn - self.server_url = server_url - self.save_images = save_images - self.dst_format = dst_format - - def _export_dataset(self): - with tempfile.TemporaryDirectory(dir=self.cache_dir) as temp_dir: - temp_file = osp.join(temp_dir, "result") - self.export_fn( - self.instance_id, - temp_file, - self.dst_format, - server_url=self.server_url, - save_images=self.save_images, - ) - os.replace(temp_file, self.output_path) - - def export( *, dst_format: str, @@ -246,47 +153,30 @@ def export( os.makedirs(cache_dir, exist_ok=True) - stop_event = threading.Event() - + # acquire a lock 2 times instead of using one long lock: + # 1. to check whether the file exists or not + # 2. to create a file when it doesn't exist with get_export_cache_lock( output_path, - block=True, - acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, ttl=EXPORT_LOCK_TTL, - num_extensions=math.ceil(rq.get_current_job().timeout / EXPORT_LOCK_EXTEND_INTERVAL), - ) as red_lock: + acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, + ): if osp.exists(output_path): # Update last update time to prolong the export lifetime # and postpone the file deleting by the cleaning job os.utime(output_path, None) return output_path - extend_lock_thread = ExtendLockThread( - lock=red_lock, - lock_extend_interval=EXPORT_LOCK_EXTEND_INTERVAL, - stop_event=stop_event, - ) - extend_lock_thread.start() - - export_thread = ExportThread( - cache_dir, + with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: + temp_file = osp.join(temp_dir, 'result') + export_fn(db_instance.id, temp_file, dst_format, + server_url=server_url, save_images=save_images) + with get_export_cache_lock( output_path, - db_instance.id, - export_fn, - server_url, - save_images, - dst_format, - ) - export_thread.start() - - while export_thread.is_alive(): - if stop_event.is_set(): - raise ExtendLockError("Export aborted because the lock extension failed.") - sleep(5) - - export_thread.join() - stop_event.set() - extend_lock_thread.join() + ttl=EXPORT_LOCK_TTL, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, + ): + os.replace(temp_file, output_path) scheduler: Scheduler = django_rq.get_scheduler(settings.CVAT_QUEUES.EXPORT_DATA.value) cleaning_job = scheduler.enqueue_in( @@ -354,7 +244,7 @@ def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger file_path, block=True, acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, - ttl=rq.get_current_job().timeout, + ttl=EXPORT_LOCK_TTL, ): if not osp.exists(file_path): raise FileNotFoundError("Export cache file '{}' doesn't exist".format(file_path)) diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 64f9a8746791..7cc494518db7 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -52,6 +52,10 @@ slogger = ServerLogManager(__name__) +REQUEST_TIMEOUT = 60 +# it's better to return LockNotAvailableError instead of 502 error TODO: (or 504?) +LOCK_ACQUIRE_TIMEOUT = REQUEST_TIMEOUT - 2 +LOCK_TTL = REQUEST_TIMEOUT - 5 class _ResourceExportManager(ABC): QUEUE_NAME = settings.CVAT_QUEUES.EXPORT_DATA.value @@ -226,7 +230,7 @@ def is_result_outdated() -> bool: return rq_job.meta[RQJobMetaField.REQUEST]["timestamp"] < instance_update_time def handle_local_download() -> Response: - with dm.util.get_export_cache_lock(file_path, ttl=REQUEST_TIMEOUT): + with dm.util.get_export_cache_lock(file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT): if not osp.exists(file_path): return Response( "The exported file has expired, please retry exporting", @@ -301,8 +305,6 @@ def handle_local_download() -> Response: instance_update_time = self.get_instance_update_time() instance_timestamp = self.get_timestamp(instance_update_time) - REQUEST_TIMEOUT = 60 - if rq_job_status == RQJobStatus.FINISHED: if self.export_args.location == Location.CLOUD_STORAGE: rq_job.delete() @@ -319,7 +321,11 @@ def handle_local_download() -> Response: if action == "download": return handle_local_download() else: - with dm.util.get_export_cache_lock(file_path, ttl=REQUEST_TIMEOUT): + with dm.util.get_export_cache_lock( + file_path, + ttl=LOCK_TTL, + acquire_timeout=LOCK_ACQUIRE_TIMEOUT, + ): if osp.exists(file_path) and not is_result_outdated(): # Update last update time to prolong the export lifetime # as the last access time is not available on every filesystem From 1e2fe5fbb33444d04a97c6d7d7665b2f9309858b Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 5 Dec 2024 20:11:10 +0100 Subject: [PATCH 11/32] update unit tests --- .../tests/test_rest_api_formats.py | 556 ++++++++++-------- cvat/apps/dataset_manager/views.py | 3 +- 2 files changed, 325 insertions(+), 234 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 059a45f6df2d..20e981951aed 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1318,6 +1318,32 @@ def get(self) -> str: class _LockTimeoutError(Exception): pass + def setUp(self): + self.export_cache_lock = multiprocessing.Lock() + + @contextmanager + def patched_get_export_cache_lock(self, export_path, *, ttl, block=True, acquire_timeout=None): + # fakeredis lock acquired in a subprocess won't be visible to other processes + # just implement the lock here + from cvat.apps.dataset_manager.util import LockNotAvailableError + + if isinstance(acquire_timeout, timedelta): + acquire_timeout = acquire_timeout.total_seconds() + if acquire_timeout is None: + acquire_timeout = -1 + + acquired = self.export_cache_lock.acquire( + block=block, timeout=acquire_timeout if acquire_timeout > -1 else None + ) + + if not acquired: + raise LockNotAvailableError + + try: + yield + finally: + self.export_cache_lock.release() + @overload @classmethod def set_condition(cls, var: SharedBool, value: bool = True): ... @@ -1387,6 +1413,20 @@ def process_closing(process: multiprocessing.Process, *, timeout: Optional[int] process.join(timeout=timeout) process.close() + def _setup_task_with_annotations( + self, + *, + number_of_images: int = 3, + format_name: str | None = None, + name_ann: str | None = None, + ): + assert format_name or name_ann + images = self._generate_task_images(number_of_images) + task = self._create_task(tasks["main"], images) + self._create_annotations(task, name_ann or f"{format_name} many jobs", "default") + + return task + def test_concurrent_export_and_cleanup(self): side_effect = self.side_effect chain_side_effects = self.chain_side_effects @@ -1397,185 +1437,176 @@ def test_concurrent_export_and_cleanup(self): format_name = "CVAT for images 1.1" - export_cache_lock = multiprocessing.Lock() - export_checked_the_file = self.SharedBool() - export_created_the_file = self.SharedBool() - export_file_path = self.SharedString() - clear_removed_the_file = self.SharedBool() - - @contextmanager - def patched_get_export_cache_lock(export_path, *, ttl, block=True, acquire_timeout=None): - # fakeredis lock acquired in a subprocess won't be visible to other processes - # just implement the lock here - from cvat.apps.dataset_manager.util import LockNotAvailableError - - if isinstance(acquire_timeout, timedelta): - acquire_timeout = acquire_timeout.total_seconds() - if acquire_timeout is None: - acquire_timeout = -1 - - acquired = export_cache_lock.acquire( - block=block, - timeout=acquire_timeout if acquire_timeout > -1 else None - ) - - if not acquired: - raise LockNotAvailableError - - try: - yield - finally: - export_cache_lock.release() + clear_process_has_run = self.SharedBool() + export_outdated_after = timedelta(seconds=1) def _export(*_, task_id: int): - from os.path import exists as original_exists - from os import replace as original_replace - from cvat.apps.dataset_manager.views import log_exception as original_log_exception import sys + from os import replace as original_replace + from os.path import exists as original_exists - def os_replace_dst_recorder(_: str, dst: str): - set_condition(export_file_path, dst) - return MOCK_DEFAULT + from cvat.apps.dataset_manager.views import log_exception as original_log_exception def patched_log_exception(logger=None, exc_info=True): cur_exc_info = sys.exc_info() if exc_info is True else exc_info - if cur_exc_info and cur_exc_info[1] and isinstance(cur_exc_info[1], _LockTimeoutError): - return # don't spam in logs with expected errors + if ( + cur_exc_info + and cur_exc_info[1] + and isinstance(cur_exc_info[1], _LockTimeoutError) + ): + return # don't spam in logs with expected errors original_log_exception(logger, exc_info) with ( - patch('cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TIMEOUT', new=5), + patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=5), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT", new=10), patch( - 'cvat.apps.dataset_manager.views.get_export_cache_lock', - new=patched_get_export_cache_lock + "cvat.apps.dataset_manager.views.get_export_cache_lock", + new=self.patched_get_export_cache_lock, ), - patch('cvat.apps.dataset_manager.views.osp.exists') as mock_osp_exists, - patch('cvat.apps.dataset_manager.views.os.replace') as mock_os_replace, - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), - patch('cvat.apps.dataset_manager.views.log_exception', new=patched_log_exception), + # We need to mock the function directly imported into the module + # to ensure that the `export_checked_the_file` condition is set + # only after checking whether a file exists inside an acquired lock + patch("cvat.apps.dataset_manager.views.osp_exists") as mock_osp_exists, + patch( + "cvat.apps.dataset_manager.views.os.replace", side_effect=original_replace + ) as mock_os_replace, + patch( + "cvat.apps.dataset_manager.views.rq.get_current_job" + ) as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), + patch("cvat.apps.dataset_manager.views.log_exception", new=patched_log_exception), ): mock_osp_exists.side_effect = chain_side_effects( original_exists, side_effect(set_condition, export_checked_the_file), - ) - - mock_os_replace.side_effect = chain_side_effects( - original_replace, - os_replace_dst_recorder, - side_effect(set_condition, export_created_the_file), - side_effect(wait_condition, clear_removed_the_file), + side_effect(wait_condition, clear_process_has_run), + side_effect(sleep, 1), ) mock_rq_get_current_job.return_value = MagicMock(timeout=5) - exited_by_timeout = False - try: - export(dst_format=format_name, task_id=task_id) - except _LockTimeoutError: - # should come from waiting for clear_removed_the_file - exited_by_timeout = True - - assert exited_by_timeout - mock_os_replace.assert_called_once() - + export(dst_format=format_name, task_id=task_id) + mock_os_replace.assert_not_called() def _clear(*_, file_path: str, file_ctime: str): from os import remove as original_remove - from cvat.apps.dataset_manager.util import LockNotAvailableError + + from cvat.apps.dataset_manager.views import FileIsBeingUsedError with ( - patch('cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TIMEOUT', new=5), + patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=5), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT", new=10), + patch( + "cvat.apps.dataset_manager.views.get_export_cache_lock", + new=self.patched_get_export_cache_lock, + ), + patch( + "cvat.apps.dataset_manager.views.os.remove", side_effect=original_remove + ) as mock_os_remove, patch( - 'cvat.apps.dataset_manager.views.get_export_cache_lock', - new=patched_get_export_cache_lock + "cvat.apps.dataset_manager.views.rq.get_current_job" + ) as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), + patch( + "cvat.apps.dataset_manager.views.TTL_CONSTS", + new={"task": export_outdated_after}, ), - patch('cvat.apps.dataset_manager.views.os.remove') as mock_os_remove, - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), - patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), ): - mock_os_remove.side_effect = chain_side_effects( - side_effect(wait_condition, export_created_the_file), - original_remove, - side_effect(set_condition, clear_removed_the_file), - ) - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - exited_by_timeout = False + set_condition(clear_process_has_run) + file_is_been_used_error_raised = False try: clear_export_cache( file_path=file_path, file_ctime=file_ctime, logger=MagicMock() ) - except LockNotAvailableError: - # should come from waiting for get_export_cache_lock - exited_by_timeout = True - - assert exited_by_timeout + except FileIsBeingUsedError: + file_is_been_used_error_raised = True + assert file_is_been_used_error_raised + mock_os_remove.assert_not_called() # The problem checked is TOCTOU / race condition for file existence check and - # further file creation / removal. There are several possible variants of the problem. + # further file update / removal. There are several possible variants of the problem. # An example: - # 1. export checks the file exists, but outdated + # 1. export checks the file exists -> file is not outdated -> need to touch file's updated_date # 2. clear checks the file exists, and matches the creation timestamp - # 3. export creates the new export file - # 4. remove removes the new export file (instead of the one that it checked) + # 3. export updates the files's modification date and does not run actual export + # 4. remove removes the actual export file # Thus, we have no exported file after the successful export. - # + + # note: it is not possibel to achive a situation + # when clear process deletes newly "re-created by export process" + # file instead of the checked one since file names contain a timestamp. + # Other variants can be variations on the intermediate calls, such as getmtime: # - export: exists() # - clear: remove() # - export: getmtime() -> an exception + + # - clear_1: exists() + # - clear_2: remove() + # - clear_1: getmtime() -> an exception # etc. - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): mock_rq_job = MagicMock(timeout=5) mock_rq_get_current_job.return_value = mock_rq_job + # create a file in the export cache first_export_path = export(dst_format=format_name, task_id=task_id) + self.assertTrue(osp.isfile(first_export_path)) - export_instance_timestamp = parse_export_file_path(first_export_path).instance_timestamp + initial_file_modfication_time = os.path.getmtime(first_export_path) + # make sure that a file in the export cache is outdated by timeout + # and a file would have to be deleted if the export was not running in parallel + sleep(export_outdated_after.seconds + 1) - self._create_annotations(task, f'{format_name} many jobs', "default") + export_instance_timestamp = parse_export_file_path(first_export_path).instance_timestamp processes_finished_correctly = False with ExitStack() as es: # Run both operations concurrently # Threads could be faster, but they can't be terminated - export_process = es.enter_context(process_closing(multiprocessing.Process( - target=_export, - args=( - export_cache_lock, - export_checked_the_file, export_created_the_file, - export_file_path, clear_removed_the_file, - ), - kwargs=dict(task_id=task_id), - ))) - clear_process = es.enter_context(process_closing(multiprocessing.Process( - target=_clear, - args=( - export_cache_lock, - export_checked_the_file, export_created_the_file, - export_file_path, clear_removed_the_file, - ), - kwargs=dict(file_path=first_export_path, file_ctime=export_instance_timestamp), - ))) + export_process = es.enter_context( + process_closing( + multiprocessing.Process( + target=_export, + args=( + self.export_cache_lock, + export_checked_the_file, + ), + kwargs=dict(task_id=task_id), + ) + ) + ) + clear_process = es.enter_context( + process_closing( + multiprocessing.Process( + target=_clear, + args=( + self.export_cache_lock, + export_checked_the_file, + ), + kwargs=dict( + file_path=first_export_path, file_ctime=export_instance_timestamp + ), + ) + ) + ) export_process.start() - wait_condition(export_checked_the_file) # ensure the expected execution order + wait_condition(export_checked_the_file) # ensure the expected execution order clear_process.start() # A deadlock (interrupted by a timeout error) is the positive outcome in this test, @@ -1598,17 +1629,11 @@ def _clear(*_, file_path: str, file_ctime: str): processes_finished_correctly = True self.assertTrue(processes_finished_correctly) + self.assertGreater(os.path.getmtime(first_export_path), initial_file_modfication_time) # terminate() may break the locks, don't try to acquire # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.Process.terminate self.assertTrue(export_checked_the_file.get()) - self.assertTrue(export_created_the_file.get()) - - self.assertFalse(clear_removed_the_file.get()) - - new_export_path = export_file_path.get() - self.assertGreater(len(new_export_path), 0) - self.assertTrue(osp.isfile(new_export_path)) def test_concurrent_download_and_cleanup(self): side_effect = self.side_effect @@ -1619,14 +1644,10 @@ def test_concurrent_download_and_cleanup(self): format_name = "CVAT for images 1.1" - export_cache_lock = multiprocessing.Lock() - download_checked_the_file = self.SharedBool() clear_removed_the_file = self.SharedBool() - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] download_url = self._generate_url_dump_tasks_annotations(task_id) @@ -1634,30 +1655,6 @@ def test_concurrent_download_and_cleanup(self): "format": format_name, } - @contextmanager - def patched_get_export_cache_lock(export_path, *, ttl, block=True, acquire_timeout=None): - # fakeredis lock acquired in a subprocess won't be visible to other processes - # just implement the lock here - from cvat.apps.dataset_manager.util import LockNotAvailableError - - if isinstance(acquire_timeout, timedelta): - acquire_timeout = acquire_timeout.total_seconds() - if acquire_timeout is None: - acquire_timeout = -1 - - acquired = export_cache_lock.acquire( - block=block, - timeout=acquire_timeout if acquire_timeout > -1 else None - ) - - if not acquired: - raise LockNotAvailableError - - try: - yield - finally: - export_cache_lock.release() - def _download(*_, task_id: int, export_path: str): from os.path import exists as original_exists @@ -1668,16 +1665,16 @@ def patched_osp_exists(path: str): set_condition(download_checked_the_file) wait_condition( clear_removed_the_file, timeout=20 - ) # wait more than the process timeout + ) # wait more than the process timeout return result with ( patch( - 'cvat.apps.engine.views.dm.util.get_export_cache_lock', - new=patched_get_export_cache_lock + "cvat.apps.engine.views.dm.util.get_export_cache_lock", + new=self.patched_get_export_cache_lock, ), - patch('cvat.apps.dataset_manager.views.osp.exists') as mock_osp_exists, + patch("cvat.apps.dataset_manager.views.osp.exists") as mock_osp_exists, TemporaryDirectory() as temp_dir, ): mock_osp_exists.side_effect = patched_osp_exists @@ -1691,18 +1688,23 @@ def patched_osp_exists(path: str): def _clear(*_, file_path: str, file_ctime: str): from os import remove as original_remove + from cvat.apps.dataset_manager.util import LockNotAvailableError with ( - patch('cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TIMEOUT', new=5), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT", new=3), patch( - 'cvat.apps.dataset_manager.views.get_export_cache_lock', - new=patched_get_export_cache_lock + "cvat.apps.dataset_manager.views.get_export_cache_lock", + new=self.patched_get_export_cache_lock, + ), + patch("cvat.apps.dataset_manager.views.os.remove") as mock_os_remove, + patch( + "cvat.apps.dataset_manager.views.rq.get_current_job" + ) as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), + patch( + "cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)} ), - patch('cvat.apps.dataset_manager.views.os.remove') as mock_os_remove, - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), - patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), ): mock_os_remove.side_effect = chain_side_effects( original_remove, @@ -1722,7 +1724,6 @@ def _clear(*_, file_path: str, file_ctime: str): assert exited_by_timeout - # The problem checked is TOCTOU / race condition for file existence check and # further file reading / removal. There are several possible variants of the problem. # An example: @@ -1748,7 +1749,7 @@ def patched_export(*args, **kwargs): return result - with patch('cvat.apps.dataset_manager.views.export', new=patched_export): + with patch("cvat.apps.dataset_manager.views.export", new=patched_export): response = self._get_request_with_data(download_url, download_params, self.admin) self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) @@ -1763,20 +1764,28 @@ def patched_export(*args, **kwargs): with ExitStack() as es: # Run both operations concurrently # Threads could be faster, but they can't be terminated - download_process = es.enter_context(process_closing(multiprocessing.Process( - target=_download, - args=(download_checked_the_file, clear_removed_the_file, export_cache_lock), - kwargs=dict(task_id=task_id, export_path=export_path), - ))) - clear_process = es.enter_context(process_closing(multiprocessing.Process( - target=_clear, - args=(download_checked_the_file, clear_removed_the_file, export_cache_lock), - kwargs=dict(file_path=export_path, file_ctime=export_instance_time), - ))) + download_process = es.enter_context( + process_closing( + multiprocessing.Process( + target=_download, + args=(download_checked_the_file, clear_removed_the_file), + kwargs=dict(task_id=task_id, export_path=export_path), + ) + ) + ) + clear_process = es.enter_context( + process_closing( + multiprocessing.Process( + target=_clear, + args=(download_checked_the_file, clear_removed_the_file), + kwargs=dict(file_path=export_path, file_ctime=export_instance_time), + ) + ) + ) download_process.start() - wait_condition(download_checked_the_file) # ensure the expected execution order + wait_condition(download_checked_the_file) # ensure the expected execution order clear_process.start() # A deadlock (interrupted by a timeout error) is the positive outcome in this test, @@ -1796,7 +1805,7 @@ def patched_export(*args, **kwargs): # All the expected exceptions should be handled in the process callbacks. # This is to avoid passing the test with unexpected errors - self.assertEqual(download_process.exitcode, -15) # sigterm + self.assertEqual(download_process.exitcode, -15) # sigterm self.assertEqual(clear_process.exitcode, 0) processes_finished_correctly = True @@ -1811,15 +1820,15 @@ def patched_export(*args, **kwargs): def test_export_can_create_file_and_cleanup_job(self): format_name = "CVAT for images 1.1" - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler') as mock_rq_get_scheduler, - patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch( + "cvat.apps.dataset_manager.views.django_rq.get_scheduler" + ) as mock_rq_get_scheduler, + patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)}), ): mock_rq_job = MagicMock(timeout=5) mock_rq_get_current_job.return_value = mock_rq_job @@ -1837,24 +1846,23 @@ def test_export_cache_lock_can_raise_on_releasing_expired_lock(self): with self.assertRaises(ReleaseUnlockedLock): lock_time = 2 - with get_export_cache_lock('test_export_path', ttl=lock_time, acquire_timeout=5): + with get_export_cache_lock("test_export_path", ttl=lock_time, acquire_timeout=5): sleep(lock_time + 1) def test_export_can_request_retry_on_locking_failure(self): format_name = "CVAT for images 1.1" - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] from cvat.apps.dataset_manager.util import LockNotAvailableError + with ( patch( - 'cvat.apps.dataset_manager.views.get_export_cache_lock', - side_effect=LockNotAvailableError + "cvat.apps.dataset_manager.views.get_export_cache_lock", + side_effect=LockNotAvailableError, ) as mock_get_export_cache_lock, - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), self.assertRaises(LockNotAvailableError), ): mock_rq_job = MagicMock(timeout=5) @@ -1867,25 +1875,26 @@ def test_export_can_request_retry_on_locking_failure(self): def test_export_can_reuse_older_file_if_still_relevant(self): format_name = "CVAT for images 1.1" - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) first_export_path = export(dst_format=format_name, task_id=task_id) from os.path import exists as original_exists + with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), - patch('cvat.apps.dataset_manager.views.osp.exists', side_effect=original_exists) as mock_osp_exists, - patch('cvat.apps.dataset_manager.views.os.replace') as mock_os_replace, + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), + patch( + "cvat.apps.dataset_manager.views.osp_exists", side_effect=original_exists + ) as mock_osp_exists, + patch("cvat.apps.dataset_manager.views.os.replace") as mock_os_replace, ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) @@ -1895,25 +1904,110 @@ def test_export_can_reuse_older_file_if_still_relevant(self): mock_osp_exists.assert_called_with(first_export_path) mock_os_replace.assert_not_called() + def test_initiate_concurrent_export_by_different_users(self): + side_effect = self.side_effect + chain_side_effects = self.chain_side_effects + process_closing = self.process_closing + + format_name = "CVAT for images 1.1" + + def _export( + *_, task_id: int, result_queue: multiprocessing.Queue, sleep_on_export: bool = False + ): + from os import replace as original_replace + + from cvat.apps.dataset_manager.task import export_task as original_export_task + + lock_ttl = 2 + lock_acquire_timeout = lock_ttl * 2 + + with ( + patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=lock_ttl), + patch( + "cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT", + new=lock_acquire_timeout, + ), + patch( + "cvat.apps.dataset_manager.views.get_export_cache_lock", + new=self.patched_get_export_cache_lock, + ), + patch("cvat.apps.dataset_manager.views.os.replace") as mock_os_replace, + patch( + "cvat.apps.dataset_manager.views.rq.get_current_job" + ) as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), + patch("cvat.apps.dataset_manager.views.task.export_task") as mock_export_task_func, + ): + mock_export_task_func.side_effect = chain_side_effects( + original_export_task, + side_effect(sleep, lock_acquire_timeout + 1 if sleep_on_export else 0), + ) + mock_os_replace.side_effect = original_replace + + mock_rq_get_current_job.return_value = MagicMock(timeout=5) + result_file_path = export(dst_format=format_name, task_id=task_id) + result_queue.put(result_file_path) + + mock_os_replace.assert_called_once() + + task = self._setup_task_with_annotations(format_name=format_name) + + with ExitStack() as es: + result_queue = multiprocessing.Queue() + number_of_processes = 2 + export_process_1 = es.enter_context( + process_closing( + multiprocessing.Process( + target=_export, + kwargs=dict( + task_id=task["id"], result_queue=result_queue, sleep_on_export=True + ), + ) + ) + ) + export_process_2 = es.enter_context( + process_closing( + multiprocessing.Process( + target=_export, + kwargs=dict(task_id=task["id"], result_queue=result_queue), + ) + ) + ) + + export_process_1.start() + export_process_2.start() + export_process_2.join(timeout=10) + export_process_1.join(timeout=10) + + self.assertFalse(export_process_1.is_alive()) + self.assertFalse(export_process_2.is_alive()) + + self.assertEqual(export_process_1.exitcode, 0) + self.assertEqual(export_process_2.exitcode, 0) + paths = {result_queue.get() for _ in range(number_of_processes)} + result_queue.close() + + self.assertTrue(len(paths) == 1) + self.assertNotEqual(paths, {None}) + self.assertTrue(osp.isfile(list(paths)[0])) + def test_cleanup_can_remove_file(self): format_name = "CVAT for images 1.1" - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) export_path = export(dst_format=format_name, task_id=task_id) with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), - patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), + patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)}), ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) @@ -1925,15 +2019,14 @@ def test_cleanup_can_remove_file(self): def test_cleanup_can_request_retry_on_locking_failure(self): format_name = "CVAT for images 1.1" - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] from cvat.apps.dataset_manager.util import LockNotAvailableError + with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) @@ -1941,11 +2034,11 @@ def test_cleanup_can_request_retry_on_locking_failure(self): with ( patch( - 'cvat.apps.dataset_manager.views.get_export_cache_lock', - side_effect=LockNotAvailableError + "cvat.apps.dataset_manager.views.get_export_cache_lock", + side_effect=LockNotAvailableError, ) as mock_get_export_cache_lock, - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), self.assertRaises(LockNotAvailableError), ): mock_rq_job = MagicMock(timeout=5) @@ -1960,8 +2053,8 @@ def test_cleanup_can_request_retry_on_locking_failure(self): def test_cleanup_can_fail_if_no_file(self): with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), self.assertRaises(FileNotFoundError), ): mock_rq_job = MagicMock(timeout=5) @@ -1971,23 +2064,22 @@ def test_cleanup_can_fail_if_no_file(self): def test_cleanup_can_defer_removal_if_file_is_used_recently(self): format_name = "CVAT for images 1.1" - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) export_path = export(dst_format=format_name, task_id=task_id) from cvat.apps.dataset_manager.views import FileIsBeingUsedError + with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(hours=1)}), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(hours=1)}), self.assertRaises(FileIsBeingUsedError), ): mock_rq_job = MagicMock(timeout=5) @@ -2006,14 +2098,12 @@ def test_cleanup_can_be_called_with_old_signature_and_values(self): # Jobs referring to the old API can exist in the redis queues after the server is updated format_name = "CVAT for images 1.1" - images = self._generate_task_images(3) - task = self._create_task(tasks["main"], images) - self._create_annotations(task, f'{format_name} many jobs', "default") + task = self._setup_task_with_annotations(format_name=format_name) task_id = task["id"] with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.django_rq.get_scheduler'), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) @@ -2027,14 +2117,14 @@ def test_cleanup_can_be_called_with_old_signature_and_values(self): shutil.move(new_export_path, old_export_path) old_kwargs = { - 'file_path': old_export_path, - 'file_ctime': file_ctime, - 'logger': MagicMock(), + "file_path": old_export_path, + "file_ctime": file_ctime, + "logger": MagicMock(), } with ( - patch('cvat.apps.dataset_manager.views.rq.get_current_job') as mock_rq_get_current_job, - patch('cvat.apps.dataset_manager.views.TTL_CONSTS', new={'task': timedelta(seconds=0)}), + patch("cvat.apps.dataset_manager.views.rq.get_current_job") as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.TTL_CONSTS", new={"task": timedelta(seconds=0)}), ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 2d735d2f75dd..a4bbe940b9f7 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -11,6 +11,7 @@ import django_rq import rq +from os.path import exists as osp_exists from django.conf import settings from django.utils import timezone from rq_scheduler import Scheduler @@ -161,7 +162,7 @@ def export( ttl=EXPORT_LOCK_TTL, acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, ): - if osp.exists(output_path): + if osp_exists(output_path): # Update last update time to prolong the export lifetime # and postpone the file deleting by the cleaning job os.utime(output_path, None) From 58b2a94922612341c3160a13561fb66ded4e2774 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 6 Dec 2024 12:40:06 +0100 Subject: [PATCH 12/32] Small improvements --- cvat/apps/dataset_manager/default_settings.py | 16 ++++++++-------- .../tests/test_rest_api_formats.py | 8 ++++---- cvat/apps/dataset_manager/util.py | 12 +++++++----- cvat/apps/dataset_manager/views.py | 10 +++++----- cvat/apps/engine/background.py | 8 +++++--- 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index 2218a220d8b0..94feb94c3cab 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -15,21 +15,21 @@ DATASET_EXPORT_LOCK_TTL = int(os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL", default_dataset_export_lock_ttl)) "Default lifetime for the export cache lock, in seconds." -DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") +DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") "Timeout for cache lock acquiring, in seconds" -if DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT is not None: - DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = int(DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT) +if DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT is not None: + DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT = int(DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT) warnings.warn( "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " - "use DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT instead", DeprecationWarning) + "use CVAT_DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT instead", DeprecationWarning) else: - default_dataset_lock_acquire_timeout = 2 * default_dataset_export_lock_ttl - DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT = int(os.getenv("DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT", default_dataset_lock_acquire_timeout)) + default_dataset_lock_acquire_timeout = 50 + DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT = int(os.getenv("CVAT_DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT", default_dataset_lock_acquire_timeout)) -if DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT <= DATASET_EXPORT_LOCK_TTL: +if DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT <= DATASET_EXPORT_LOCK_TTL: raise ImproperlyConfigured( - "Lock acquire timeout must be more than lock TTL" + "Lock acquisition timeout must be more than lock TTL" ) DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 20e981951aed..512c5984e791 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1461,7 +1461,7 @@ def patched_log_exception(logger=None, exc_info=True): with ( patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=5), - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT", new=10), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=10), patch( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, @@ -1498,7 +1498,7 @@ def _clear(*_, file_path: str, file_ctime: str): with ( patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=5), - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT", new=10), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=10), patch( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, @@ -1692,7 +1692,7 @@ def _clear(*_, file_path: str, file_ctime: str): from cvat.apps.dataset_manager.util import LockNotAvailableError with ( - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT", new=3), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=3), patch( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, @@ -1924,7 +1924,7 @@ def _export( with ( patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=lock_ttl), patch( - "cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT", + "cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=lock_acquire_timeout, ), patch( diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index 184849a3f634..f4af6f07e313 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -111,15 +111,17 @@ def get_export_cache_lock( *, ttl: int | timedelta, block: bool = True, - # endless waiting for the lock should be avoided at least by default - acquire_timeout: int | timedelta | None, + acquire_timeout: int | timedelta, ) -> Generator[Lock, Any, Any]: + if acquire_timeout is None: + raise ValueError("Endless waiting for the lock should be avoided") + if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() - if acquire_timeout is not None and acquire_timeout < 0: + + if acquire_timeout < 0: raise ValueError("acquire_timeout must be a non-negative number") - elif acquire_timeout is None: - acquire_timeout = -1 + if isinstance(ttl, timedelta): ttl = ttl.total_seconds() diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index a4bbe940b9f7..f5517fab6de7 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -53,7 +53,7 @@ def log_exception(logger: logging.Logger | None = None, exc_info: bool = True): 'job': JOB_CACHE_TTL, } -EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_ACQUIRE_TIMEOUT) +EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT) EXPORT_LOCKED_RETRY_INTERVAL = timedelta(seconds=settings.DATASET_EXPORT_LOCKED_RETRY_INTERVAL) EXPORT_LOCK_TTL = timedelta(seconds=settings.DATASET_EXPORT_LOCK_TTL) @@ -160,10 +160,10 @@ def export( with get_export_cache_lock( output_path, ttl=EXPORT_LOCK_TTL, - acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ): if osp_exists(output_path): - # Update last update time to prolong the export lifetime + # Update last modification time to prolong the export lifetime # and postpone the file deleting by the cleaning job os.utime(output_path, None) return output_path @@ -175,7 +175,7 @@ def export( with get_export_cache_lock( output_path, ttl=EXPORT_LOCK_TTL, - acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ): os.replace(temp_file, output_path) @@ -244,7 +244,7 @@ def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger with get_export_cache_lock( file_path, block=True, - acquire_timeout=EXPORT_CACHE_LOCK_ACQUIRE_TIMEOUT, + acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ttl=EXPORT_LOCK_TTL, ): if not osp.exists(file_path): diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 7cc494518db7..101ef22d43e7 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -53,9 +53,9 @@ slogger = ServerLogManager(__name__) REQUEST_TIMEOUT = 60 -# it's better to return LockNotAvailableError instead of 502 error TODO: (or 504?) -LOCK_ACQUIRE_TIMEOUT = REQUEST_TIMEOUT - 2 +# it's better to return LockNotAvailableError instead of response with 504 status LOCK_TTL = REQUEST_TIMEOUT - 5 +LOCK_ACQUIRE_TIMEOUT = LOCK_TTL - 5 class _ResourceExportManager(ABC): QUEUE_NAME = settings.CVAT_QUEUES.EXPORT_DATA.value @@ -230,7 +230,9 @@ def is_result_outdated() -> bool: return rq_job.meta[RQJobMetaField.REQUEST]["timestamp"] < instance_update_time def handle_local_download() -> Response: - with dm.util.get_export_cache_lock(file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT): + with dm.util.get_export_cache_lock( + file_path, ttl=LOCK_TTL, acquire_timeout=LOCK_ACQUIRE_TIMEOUT + ): if not osp.exists(file_path): return Response( "The exported file has expired, please retry exporting", From 015c39c53916c9fee89097a886a80add95cd2a9e Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 6 Dec 2024 14:00:16 +0100 Subject: [PATCH 13/32] Mask scheduled status as queued --- cvat/apps/engine/serializers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cvat/apps/engine/serializers.py b/cvat/apps/engine/serializers.py index f8678248d2b8..c6d1b578c7b0 100644 --- a/cvat/apps/engine/serializers.py +++ b/cvat/apps/engine/serializers.py @@ -3281,7 +3281,8 @@ def get_message(self, rq_job: RQJob) -> str: def to_representation(self, rq_job: RQJob) -> Dict[str, Any]: representation = super().to_representation(rq_job) - if representation["status"] == RQJobStatus.DEFERRED: + # FUTURE-TODO: support such statuses on UI + if representation["status"] in (RQJobStatus.DEFERRED, RQJobStatus.SCHEDULED): representation["status"] = RQJobStatus.QUEUED if representation["status"] == RQJobStatus.FINISHED: From 2bf9a0bae0e39fabdee9e9f4ca82b63cb3be7174 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Mon, 9 Dec 2024 11:43:09 +0100 Subject: [PATCH 14/32] Update cvat/apps/dataset_manager/views.py --- cvat/apps/dataset_manager/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index f5517fab6de7..2798ae0052c9 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -38,7 +38,7 @@ def log_exception(logger: logging.Logger | None = None, exc_info: bool = True): if logger is None: - logger = slogger + logger = slogger.glob logger.exception("[%s @ %s]: exception occurred" % \ (_MODULE_NAME, current_function_name(2)), exc_info=exc_info) From fd8658ea55560789f369c6d3eb0af486141e4897 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 11 Dec 2024 10:43:39 +0100 Subject: [PATCH 15/32] Apply some comments --- .../dataset_manager/tests/test_rest_api_formats.py | 11 +++++------ cvat/apps/dataset_manager/util.py | 9 +++++++-- cvat/apps/dataset_manager/views.py | 8 +++----- cvat/apps/engine/background.py | 4 +--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 512c5984e791..3d3e4ff86366 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1483,7 +1483,6 @@ def patched_log_exception(logger=None, exc_info=True): original_exists, side_effect(set_condition, export_checked_the_file), side_effect(wait_condition, clear_process_has_run), - side_effect(sleep, 1), ) mock_rq_get_current_job.return_value = MagicMock(timeout=5) @@ -1517,16 +1516,16 @@ def _clear(*_, file_path: str, file_ctime: str): ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) - set_condition(clear_process_has_run) - file_is_been_used_error_raised = False + file_is_being_used_error_raised = False try: + set_condition(clear_process_has_run) clear_export_cache( file_path=file_path, file_ctime=file_ctime, logger=MagicMock() ) except FileIsBeingUsedError: - file_is_been_used_error_raised = True + file_is_being_used_error_raised = True - assert file_is_been_used_error_raised + assert file_is_being_used_error_raised mock_os_remove.assert_not_called() # The problem checked is TOCTOU / race condition for file existence check and @@ -1538,7 +1537,7 @@ def _clear(*_, file_path: str, file_ctime: str): # 4. remove removes the actual export file # Thus, we have no exported file after the successful export. - # note: it is not possibel to achive a situation + # note: it is not possible to achieve the situation # when clear process deletes newly "re-created by export process" # file instead of the checked one since file names contain a timestamp. diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index f4af6f07e313..d099686223c2 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -113,8 +113,7 @@ def get_export_cache_lock( block: bool = True, acquire_timeout: int | timedelta, ) -> Generator[Lock, Any, Any]: - if acquire_timeout is None: - raise ValueError("Endless waiting for the lock should be avoided") + assert acquire_timeout is not None, "Endless waiting for the lock should be avoided" if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() @@ -236,3 +235,9 @@ def parse_export_file_path(file_path: os.PathLike[str]) -> ParsedExportFilename: format_repr=basename_match.group('format_tag'), file_ext=basename_match.group('file_ext'), ) + +def extend_export_file_lifetime(file_path: str): + # Update the last modification time to extend the export's lifetime, + # as the last access time is not available on every filesystem. + # As a result, file deletion by the cleaning job will be postponed. + os.utime(file_path, None) \ No newline at end of file diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 2798ae0052c9..7252d6cc33af 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -27,7 +27,7 @@ LockNotAvailableError, current_function_name, get_export_cache_lock, get_export_cache_dir, make_export_filename, - parse_export_file_path + parse_export_file_path, extend_export_file_lifetime ) from .util import EXPORT_CACHE_DIR_NAME # pylint: disable=unused-import @@ -163,9 +163,7 @@ def export( acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ): if osp_exists(output_path): - # Update last modification time to prolong the export lifetime - # and postpone the file deleting by the cleaning job - os.utime(output_path, None) + extend_export_file_lifetime(output_path) return output_path with tempfile.TemporaryDirectory(dir=cache_dir) as temp_dir: @@ -193,7 +191,7 @@ def export( "and available for downloading for the next {}. " "Export cache cleaning job is enqueued, id '{}'".format( db_instance.__class__.__name__.lower(), - db_instance.name if isinstance(db_instance, (Project, Task)) else db_instance.id, + db_instance.id, dst_format, output_path, cache_ttl, diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 101ef22d43e7..4d61d3380d24 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -329,9 +329,7 @@ def handle_local_download() -> Response: acquire_timeout=LOCK_ACQUIRE_TIMEOUT, ): if osp.exists(file_path) and not is_result_outdated(): - # Update last update time to prolong the export lifetime - # as the last access time is not available on every filesystem - os.utime(file_path, None) + dm.util.extend_export_file_lifetime(file_path) return Response(status=status.HTTP_201_CREATED) From 8d050c017466ab6edf815a7ab6f5c14b5c5badf3 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 11 Dec 2024 12:31:27 +0100 Subject: [PATCH 16/32] Fix pylint issue --- cvat/apps/dataset_manager/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/dataset_manager/util.py b/cvat/apps/dataset_manager/util.py index d099686223c2..6d814ec2c679 100644 --- a/cvat/apps/dataset_manager/util.py +++ b/cvat/apps/dataset_manager/util.py @@ -240,4 +240,4 @@ def extend_export_file_lifetime(file_path: str): # Update the last modification time to extend the export's lifetime, # as the last access time is not available on every filesystem. # As a result, file deletion by the cleaning job will be postponed. - os.utime(file_path, None) \ No newline at end of file + os.utime(file_path, None) From 903874154ce1690fac8d2a5d55be18033c3eaa2a Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 12 Dec 2024 13:40:45 +0100 Subject: [PATCH 17/32] Update test_concurrent_export_and_cleanup --- .../tests/test_rest_api_formats.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 3d3e4ff86366..bbf6dec0089b 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1437,8 +1437,10 @@ def test_concurrent_export_and_cleanup(self): format_name = "CVAT for images 1.1" + export_file_path = self.SharedString() export_checked_the_file = self.SharedBool() clear_process_has_run = self.SharedBool() + clear_removed_the_file = self.SharedBool() export_outdated_after = timedelta(seconds=1) def _export(*_, task_id: int): @@ -1473,10 +1475,6 @@ def patched_log_exception(logger=None, exc_info=True): patch( "cvat.apps.dataset_manager.views.os.replace", side_effect=original_replace ) as mock_os_replace, - patch( - "cvat.apps.dataset_manager.views.rq.get_current_job" - ) as mock_rq_get_current_job, - patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), patch("cvat.apps.dataset_manager.views.log_exception", new=patched_log_exception), ): mock_osp_exists.side_effect = chain_side_effects( @@ -1484,10 +1482,8 @@ def patched_log_exception(logger=None, exc_info=True): side_effect(set_condition, export_checked_the_file), side_effect(wait_condition, clear_process_has_run), ) - - mock_rq_get_current_job.return_value = MagicMock(timeout=5) - - export(dst_format=format_name, task_id=task_id) + result_file = export(dst_format=format_name, task_id=task_id) + set_condition(export_file_path, result_file) mock_os_replace.assert_not_called() def _clear(*_, file_path: str, file_ctime: str): @@ -1503,7 +1499,7 @@ def _clear(*_, file_path: str, file_ctime: str): new=self.patched_get_export_cache_lock, ), patch( - "cvat.apps.dataset_manager.views.os.remove", side_effect=original_remove + "cvat.apps.dataset_manager.views.os.remove" ) as mock_os_remove, patch( "cvat.apps.dataset_manager.views.rq.get_current_job" @@ -1515,6 +1511,10 @@ def _clear(*_, file_path: str, file_ctime: str): ), ): mock_rq_get_current_job.return_value = MagicMock(timeout=5) + mock_os_remove.side_effect = chain_side_effects( + original_remove, + side_effect(set_condition, clear_removed_the_file), + ) file_is_being_used_error_raised = False try: @@ -1563,7 +1563,6 @@ def _clear(*_, file_path: str, file_ctime: str): # create a file in the export cache first_export_path = export(dst_format=format_name, task_id=task_id) - self.assertTrue(osp.isfile(first_export_path)) initial_file_modfication_time = os.path.getmtime(first_export_path) # make sure that a file in the export cache is outdated by timeout @@ -1613,8 +1612,6 @@ def _clear(*_, file_path: str, file_ctime: str): # clear() must wait for the export cache lock release (acquired by export()). # It must be finished by a timeout, as export() holds it, waiting clear_process.join(timeout=10) - - # export() must wait for the clear() file existence check and fail because of timeout export_process.join(timeout=10) self.assertFalse(export_process.is_alive()) @@ -1628,6 +1625,12 @@ def _clear(*_, file_path: str, file_ctime: str): processes_finished_correctly = True self.assertTrue(processes_finished_correctly) + self.assertFalse(clear_removed_the_file.get()) + + new_export_path = export_file_path.get() + self.assertGreater(len(new_export_path), 0) + self.assertTrue(osp.isfile(new_export_path)) + self.assertTrue(osp.isfile(first_export_path)) self.assertGreater(os.path.getmtime(first_export_path), initial_file_modfication_time) # terminate() may break the locks, don't try to acquire From a52751b6221614cd79aa2dc6bdfe32ad70727f6e Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 12 Dec 2024 19:31:28 +0100 Subject: [PATCH 18/32] Update test_initiate_concurrent_export_by_different_users --- .../tests/test_rest_api_formats.py | 50 +++++++++++++++---- 1 file changed, 39 insertions(+), 11 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index bbf6dec0089b..adfed85b6203 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1910,11 +1910,25 @@ def test_initiate_concurrent_export_by_different_users(self): side_effect = self.side_effect chain_side_effects = self.chain_side_effects process_closing = self.process_closing + wait_condition = self.wait_condition + set_condition = self.set_condition + + export_fn_started_by_1st_process = self.SharedBool() + export_fn_finished_by_1st_process = self.SharedBool() + export_fn_started_by_2nd_process = self.SharedBool() + export_fn_finished_by_2nd_process = self.SharedBool() + format_name = "CVAT for images 1.1" def _export( - *_, task_id: int, result_queue: multiprocessing.Queue, sleep_on_export: bool = False + *_, + task_id: int, + result_queue: multiprocessing.Queue, + export_fn_started: self.SharedBool, + export_fn_finished: self.SharedBool | None = None, + wait_before_fn_finish: self.SharedBool | None = None, + wait_before_replace: self.SharedBool | None = None, ): from os import replace as original_replace @@ -1934,22 +1948,24 @@ def _export( new=self.patched_get_export_cache_lock, ), patch("cvat.apps.dataset_manager.views.os.replace") as mock_os_replace, - patch( - "cvat.apps.dataset_manager.views.rq.get_current_job" - ) as mock_rq_get_current_job, + patch("cvat.apps.dataset_manager.views.task.export_task") as mock_export_fn, patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), - patch("cvat.apps.dataset_manager.views.task.export_task") as mock_export_task_func, ): - mock_export_task_func.side_effect = chain_side_effects( + mock_export_fn.side_effect = chain_side_effects( + side_effect(set_condition, export_fn_started), original_export_task, - side_effect(sleep, lock_acquire_timeout + 1 if sleep_on_export else 0), + *((side_effect(wait_condition, wait_before_fn_finish),) if wait_before_fn_finish is not None else []), + *((side_effect(set_condition, export_fn_finished),) if export_fn_finished is not None else []), ) - mock_os_replace.side_effect = original_replace - mock_rq_get_current_job.return_value = MagicMock(timeout=5) + mock_os_replace.side_effect = chain_side_effects( + *((side_effect(wait_condition, wait_before_replace),) if wait_before_replace is not None else []), + original_replace + ) result_file_path = export(dst_format=format_name, task_id=task_id) result_queue.put(result_file_path) + mock_export_fn.assert_called_once() mock_os_replace.assert_called_once() task = self._setup_task_with_annotations(format_name=format_name) @@ -1962,7 +1978,12 @@ def _export( multiprocessing.Process( target=_export, kwargs=dict( - task_id=task["id"], result_queue=result_queue, sleep_on_export=True + task_id=task["id"], + result_queue=result_queue, + export_fn_started=export_fn_started_by_1st_process, + wait_before_fn_finish=export_fn_started_by_2nd_process, + export_fn_finished=export_fn_finished_by_1st_process, + wait_before_replace=export_fn_finished_by_2nd_process, ), ) ) @@ -1971,12 +1992,19 @@ def _export( process_closing( multiprocessing.Process( target=_export, - kwargs=dict(task_id=task["id"], result_queue=result_queue), + kwargs=dict( + task_id=task["id"], + result_queue=result_queue, + export_fn_started=export_fn_started_by_2nd_process, + export_fn_finished=export_fn_finished_by_2nd_process, + ), ) ) ) export_process_1.start() + wait_condition(export_fn_started_by_1st_process) + export_process_2.start() export_process_2.join(timeout=10) export_process_1.join(timeout=10) From 9dcc135b1995091e0c9135ccb5a5c78316f6e31c Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 13 Dec 2024 10:51:49 +0100 Subject: [PATCH 19/32] Fix tests --- .../tests/test_rest_api_formats.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index adfed85b6203..fb11f1cb6c99 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1322,18 +1322,19 @@ def setUp(self): self.export_cache_lock = multiprocessing.Lock() @contextmanager - def patched_get_export_cache_lock(self, export_path, *, ttl, block=True, acquire_timeout=None): + def patched_get_export_cache_lock(self, export_path, *, ttl: int | timedelta, block: bool = True, acquire_timeout: int | timedelta): # fakeredis lock acquired in a subprocess won't be visible to other processes # just implement the lock here from cvat.apps.dataset_manager.util import LockNotAvailableError + assert acquire_timeout + assert ttl + if isinstance(acquire_timeout, timedelta): acquire_timeout = acquire_timeout.total_seconds() - if acquire_timeout is None: - acquire_timeout = -1 acquired = self.export_cache_lock.acquire( - block=block, timeout=acquire_timeout if acquire_timeout > -1 else None + block=block, timeout=acquire_timeout ) if not acquired: @@ -1481,6 +1482,7 @@ def patched_log_exception(logger=None, exc_info=True): original_exists, side_effect(set_condition, export_checked_the_file), side_effect(wait_condition, clear_process_has_run), + side_effect(sleep, 2), ) result_file = export(dst_format=format_name, task_id=task_id) set_condition(export_file_path, result_file) @@ -1611,8 +1613,8 @@ def _clear(*_, file_path: str, file_ctime: str): # if the problem is fixed. # clear() must wait for the export cache lock release (acquired by export()). # It must be finished by a timeout, as export() holds it, waiting - clear_process.join(timeout=10) - export_process.join(timeout=10) + clear_process.join(timeout=15) + export_process.join(timeout=15) self.assertFalse(export_process.is_alive()) self.assertFalse(clear_process.is_alive()) @@ -1794,7 +1796,7 @@ def patched_export(*args, **kwargs): # if the problem is fixed. # clear() must wait for the export cache lock release (acquired by download()). # It must be finished by a timeout, as download() holds it, waiting - clear_process.join(timeout=5) + clear_process.join(timeout=10) # download() must wait for the clear() file existence check and fail because of timeout download_process.join(timeout=5) @@ -1954,12 +1956,12 @@ def _export( mock_export_fn.side_effect = chain_side_effects( side_effect(set_condition, export_fn_started), original_export_task, - *((side_effect(wait_condition, wait_before_fn_finish),) if wait_before_fn_finish is not None else []), + *((side_effect(wait_condition, wait_before_fn_finish, 10),) if wait_before_fn_finish is not None else []), *((side_effect(set_condition, export_fn_finished),) if export_fn_finished is not None else []), ) mock_os_replace.side_effect = chain_side_effects( - *((side_effect(wait_condition, wait_before_replace),) if wait_before_replace is not None else []), + *((side_effect(wait_condition, wait_before_replace, 10),) if wait_before_replace is not None else []), original_replace ) result_file_path = export(dst_format=format_name, task_id=task_id) From 116a9bcab219b9f270bd17debe2fb0c3ef763db8 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 13 Dec 2024 12:50:01 +0100 Subject: [PATCH 20/32] Try to fix test_initiate_concurrent_export_by_different_users on CI --- .../tests/test_rest_api_formats.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index fb11f1cb6c99..42cc334e06c8 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1916,7 +1916,7 @@ def test_initiate_concurrent_export_by_different_users(self): set_condition = self.set_condition export_fn_started_by_1st_process = self.SharedBool() - export_fn_finished_by_1st_process = self.SharedBool() + file_replacement_by_1st_process = self.SharedBool() export_fn_started_by_2nd_process = self.SharedBool() export_fn_finished_by_2nd_process = self.SharedBool() @@ -1931,12 +1931,13 @@ def _export( export_fn_finished: self.SharedBool | None = None, wait_before_fn_finish: self.SharedBool | None = None, wait_before_replace: self.SharedBool | None = None, + file_replacement: self.SharedBool | None = None, ): from os import replace as original_replace from cvat.apps.dataset_manager.task import export_task as original_export_task - lock_ttl = 2 + lock_ttl = 4 lock_acquire_timeout = lock_ttl * 2 with ( @@ -1956,13 +1957,14 @@ def _export( mock_export_fn.side_effect = chain_side_effects( side_effect(set_condition, export_fn_started), original_export_task, - *((side_effect(wait_condition, wait_before_fn_finish, 10),) if wait_before_fn_finish is not None else []), *((side_effect(set_condition, export_fn_finished),) if export_fn_finished is not None else []), + *((side_effect(wait_condition, wait_before_fn_finish, 10),) if wait_before_fn_finish is not None else []), ) mock_os_replace.side_effect = chain_side_effects( *((side_effect(wait_condition, wait_before_replace, 10),) if wait_before_replace is not None else []), - original_replace + original_replace, + *((side_effect(set_condition, file_replacement),) if file_replacement is not None else []), ) result_file_path = export(dst_format=format_name, task_id=task_id) result_queue.put(result_file_path) @@ -1984,8 +1986,8 @@ def _export( result_queue=result_queue, export_fn_started=export_fn_started_by_1st_process, wait_before_fn_finish=export_fn_started_by_2nd_process, - export_fn_finished=export_fn_finished_by_1st_process, wait_before_replace=export_fn_finished_by_2nd_process, + file_replacement=file_replacement_by_1st_process, ), ) ) @@ -1999,6 +2001,7 @@ def _export( result_queue=result_queue, export_fn_started=export_fn_started_by_2nd_process, export_fn_finished=export_fn_finished_by_2nd_process, + wait_before_fn_finish=file_replacement_by_1st_process, ), ) ) @@ -2008,8 +2011,8 @@ def _export( wait_condition(export_fn_started_by_1st_process) export_process_2.start() - export_process_2.join(timeout=10) - export_process_1.join(timeout=10) + export_process_2.join(timeout=20) + export_process_1.join(timeout=20) self.assertFalse(export_process_1.is_alive()) self.assertFalse(export_process_2.is_alive()) From 39e38c47c6d45eb14d93565ed26e2092ffe8f331 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Fri, 13 Dec 2024 13:26:05 +0100 Subject: [PATCH 21/32] t --- cvat/apps/dataset_manager/tests/test_rest_api_formats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 42cc334e06c8..942a139d1154 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1367,7 +1367,7 @@ def set_condition(cls, var: SharedBase, value: Any = _not_set): @classmethod def wait_condition(cls, var: SharedBase, timeout: Optional[int] = 5): with var.condition: - if not var.condition.wait(timeout): + if not var.get() and not var.condition.wait(timeout): raise cls._LockTimeoutError @staticmethod From 98a829b203b62faa157af9b11fa2b2fa47dcde84 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 11:57:48 +0100 Subject: [PATCH 22/32] Update tests --- .../tests/test_rest_api_formats.py | 119 +++++++++++------- cvat/apps/engine/background.py | 1 + 2 files changed, 78 insertions(+), 42 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 942a139d1154..f90ddc20e434 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1440,10 +1440,13 @@ def test_concurrent_export_and_cleanup(self): export_file_path = self.SharedString() export_checked_the_file = self.SharedBool() - clear_process_has_run = self.SharedBool() + clear_has_been_finished = self.SharedBool() clear_removed_the_file = self.SharedBool() export_outdated_after = timedelta(seconds=1) + EXPORT_LOCK_TTL = 4 + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = EXPORT_LOCK_TTL + 1 + def _export(*_, task_id: int): import sys from os import replace as original_replace @@ -1463,8 +1466,9 @@ def patched_log_exception(logger=None, exc_info=True): original_log_exception(logger, exc_info) with ( - patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=5), - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=10), + patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=EXPORT_LOCK_TTL), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", + new=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT), patch( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, @@ -1481,8 +1485,7 @@ def patched_log_exception(logger=None, exc_info=True): mock_osp_exists.side_effect = chain_side_effects( original_exists, side_effect(set_condition, export_checked_the_file), - side_effect(wait_condition, clear_process_has_run), - side_effect(sleep, 2), + side_effect(wait_condition, clear_has_been_finished), ) result_file = export(dst_format=format_name, task_id=task_id) set_condition(export_file_path, result_file) @@ -1491,11 +1494,11 @@ def patched_log_exception(logger=None, exc_info=True): def _clear(*_, file_path: str, file_ctime: str): from os import remove as original_remove - from cvat.apps.dataset_manager.views import FileIsBeingUsedError + from cvat.apps.dataset_manager.views import LockNotAvailableError with ( - patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=5), - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=10), + patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=EXPORT_LOCK_TTL), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_LOCK_TTL - 1), patch( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, @@ -1518,16 +1521,13 @@ def _clear(*_, file_path: str, file_ctime: str): side_effect(set_condition, clear_removed_the_file), ) - file_is_being_used_error_raised = False try: - set_condition(clear_process_has_run) clear_export_cache( file_path=file_path, file_ctime=file_ctime, logger=MagicMock() ) - except FileIsBeingUsedError: - file_is_being_used_error_raised = True + except LockNotAvailableError: + set_condition(clear_has_been_finished) - assert file_is_being_used_error_raised mock_os_remove.assert_not_called() # The problem checked is TOCTOU / race condition for file existence check and @@ -1915,36 +1915,73 @@ def test_initiate_concurrent_export_by_different_users(self): wait_condition = self.wait_condition set_condition = self.set_condition - export_fn_started_by_1st_process = self.SharedBool() - file_replacement_by_1st_process = self.SharedBool() - export_fn_started_by_2nd_process = self.SharedBool() - export_fn_finished_by_2nd_process = self.SharedBool() + export_1_checked_file = self.SharedBool() + export_1_made_export = self.SharedBool() + export_1_replaced_file = self.SharedBool() + export_2_checked_file = self.SharedBool() + export_2_made_export = self.SharedBool() + export_2_replaced_file = self.SharedBool() format_name = "CVAT for images 1.1" - def _export( + LOCK_TTL = 4 + LOCK_ACQUISITION_TIMEOUT = LOCK_TTL * 2 + + def _export_1( *_, task_id: int, result_queue: multiprocessing.Queue, - export_fn_started: self.SharedBool, - export_fn_finished: self.SharedBool | None = None, - wait_before_fn_finish: self.SharedBool | None = None, - wait_before_replace: self.SharedBool | None = None, - file_replacement: self.SharedBool | None = None, ): from os import replace as original_replace from cvat.apps.dataset_manager.task import export_task as original_export_task - lock_ttl = 4 - lock_acquire_timeout = lock_ttl * 2 + with ( + patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=LOCK_TTL), + patch( + "cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", + new=LOCK_ACQUISITION_TIMEOUT, + ), + patch( + "cvat.apps.dataset_manager.views.get_export_cache_lock", + new=self.patched_get_export_cache_lock, + ), + patch("cvat.apps.dataset_manager.views.os.replace") as mock_os_replace, + patch("cvat.apps.dataset_manager.views.task.export_task") as mock_export_fn, + patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), + ): + mock_export_fn.side_effect = chain_side_effects( + side_effect(set_condition, export_1_checked_file), + original_export_task, + side_effect(wait_condition, export_2_checked_file), + side_effect(set_condition, export_1_made_export), + ) + + mock_os_replace.side_effect = chain_side_effects( + original_replace, + side_effect(set_condition, export_1_replaced_file), + ) + result_file_path = export(dst_format=format_name, task_id=task_id) + result_queue.put(result_file_path) + + mock_export_fn.assert_called_once() + mock_os_replace.assert_called_once() + + def _export_2( + *_, + task_id: int, + result_queue: multiprocessing.Queue, + ): + from os import replace as original_replace + + from cvat.apps.dataset_manager.task import export_task as original_export_task with ( - patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=lock_ttl), + patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=LOCK_TTL), patch( "cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", - new=lock_acquire_timeout, + new=LOCK_ACQUISITION_TIMEOUT, ), patch( "cvat.apps.dataset_manager.views.get_export_cache_lock", @@ -1955,16 +1992,15 @@ def _export( patch("cvat.apps.dataset_manager.views.django_rq.get_scheduler"), ): mock_export_fn.side_effect = chain_side_effects( - side_effect(set_condition, export_fn_started), + side_effect(set_condition, export_2_checked_file), original_export_task, - *((side_effect(set_condition, export_fn_finished),) if export_fn_finished is not None else []), - *((side_effect(wait_condition, wait_before_fn_finish, 10),) if wait_before_fn_finish is not None else []), + side_effect(wait_condition, export_1_replaced_file), + side_effect(set_condition, export_2_made_export), ) mock_os_replace.side_effect = chain_side_effects( - *((side_effect(wait_condition, wait_before_replace, 10),) if wait_before_replace is not None else []), original_replace, - *((side_effect(set_condition, file_replacement),) if file_replacement is not None else []), + side_effect(set_condition, export_2_replaced_file), ) result_file_path = export(dst_format=format_name, task_id=task_id) result_queue.put(result_file_path) @@ -1980,14 +2016,10 @@ def _export( export_process_1 = es.enter_context( process_closing( multiprocessing.Process( - target=_export, + target=_export_1, kwargs=dict( task_id=task["id"], result_queue=result_queue, - export_fn_started=export_fn_started_by_1st_process, - wait_before_fn_finish=export_fn_started_by_2nd_process, - wait_before_replace=export_fn_finished_by_2nd_process, - file_replacement=file_replacement_by_1st_process, ), ) ) @@ -1995,20 +2027,17 @@ def _export( export_process_2 = es.enter_context( process_closing( multiprocessing.Process( - target=_export, + target=_export_2, kwargs=dict( task_id=task["id"], result_queue=result_queue, - export_fn_started=export_fn_started_by_2nd_process, - export_fn_finished=export_fn_finished_by_2nd_process, - wait_before_fn_finish=file_replacement_by_1st_process, ), ) ) ) export_process_1.start() - wait_condition(export_fn_started_by_1st_process) + wait_condition(export_1_checked_file) export_process_2.start() export_process_2.join(timeout=20) @@ -2026,6 +2055,12 @@ def _export( self.assertNotEqual(paths, {None}) self.assertTrue(osp.isfile(list(paths)[0])) + for cond in ( + export_1_checked_file, export_1_made_export, export_1_replaced_file, + export_2_checked_file, export_2_made_export, export_2_replaced_file + ): + self.assertTrue(cond.get()) + def test_cleanup_can_remove_file(self): format_name = "CVAT for images 1.1" task = self._setup_task_with_annotations(format_name=format_name) diff --git a/cvat/apps/engine/background.py b/cvat/apps/engine/background.py index 4d61d3380d24..d9f9237e6d27 100644 --- a/cvat/apps/engine/background.py +++ b/cvat/apps/engine/background.py @@ -57,6 +57,7 @@ LOCK_TTL = REQUEST_TIMEOUT - 5 LOCK_ACQUIRE_TIMEOUT = LOCK_TTL - 5 + class _ResourceExportManager(ABC): QUEUE_NAME = settings.CVAT_QUEUES.EXPORT_DATA.value From 458e330ed64c5f0c36346b4cd797f88246282a5a Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 12:14:50 +0100 Subject: [PATCH 23/32] Add changelog --- ...0241218_120228_maria_fix_export_job_lock_releasing.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md diff --git a/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md b/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md new file mode 100644 index 000000000000..75b0455bf024 --- /dev/null +++ b/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md @@ -0,0 +1,9 @@ +### Fixed + +- Acquiring a long lock with a worker TTL could result in the lock not being released, + blocking further exports with the same parameters () +- Scheduled RQ jobs could not be restarted due to incorrect RQ job status + updating and handling () +- Parallel exports with the same parameters by different users caused + a LockNotAvailableError to be raised for all users except the one + who initiated the export first () From da399e648042f16fda794bc697deb912097f90ad Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 13:06:00 +0100 Subject: [PATCH 24/32] Rename settings --- cvat/apps/dataset_manager/default_settings.py | 39 ++++++++++++++----- .../tests/test_rest_api_formats.py | 14 +++---- cvat/apps/dataset_manager/views.py | 12 +++--- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index 94feb94c3cab..91ab3b6a9c87 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -10,27 +10,46 @@ DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 24)) "Base lifetime for cached exported datasets, in seconds" -default_dataset_export_lock_ttl = 30 +default_export_cache_lock_ttl = 30 -DATASET_EXPORT_LOCK_TTL = int(os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL", default_dataset_export_lock_ttl)) +EXPORT_CACHE_LOCK_TTL = os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL") "Default lifetime for the export cache lock, in seconds." -DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") +if EXPORT_CACHE_LOCK_TTL is not None: + EXPORT_CACHE_LOCK_TTL = int(EXPORT_CACHE_LOCK_TTL) + warnings.warn( + "The CVAT_DATASET_EXPORT_LOCK_TTL is deprecated, " + "use CVAT_EXPORT_CACHE_LOCK_TTL instead", DeprecationWarning + ) +else: + EXPORT_CACHE_LOCK_TTL = int(os.getenv("CVAT_EXPORT_CACHE_LOCK_TTL", default_export_cache_lock_ttl)) + +EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") "Timeout for cache lock acquiring, in seconds" -if DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT is not None: - DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT = int(DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT) +if EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT is not None: + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int(EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT) warnings.warn( "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " - "use CVAT_DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT instead", DeprecationWarning) + "use CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT instead", DeprecationWarning + ) else: - default_dataset_lock_acquire_timeout = 50 - DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT = int(os.getenv("CVAT_DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT", default_dataset_lock_acquire_timeout)) + default_export_cache_lock_acquire_timeout = 50 + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int(os.getenv("CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", default_export_cache_lock_acquire_timeout)) -if DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT <= DATASET_EXPORT_LOCK_TTL: +if EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT <= EXPORT_CACHE_LOCK_TTL: raise ImproperlyConfigured( "Lock acquisition timeout must be more than lock TTL" ) -DATASET_EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL", 60)) +EXPORT_LOCKED_RETRY_INTERVAL = os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL") "Retry interval for cases the export cache lock was unavailable, in seconds" + +if EXPORT_LOCKED_RETRY_INTERVAL is not None: + EXPORT_LOCKED_RETRY_INTERVAL = int(EXPORT_LOCKED_RETRY_INTERVAL) + warnings.warn( + "The CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL is deprecated, " + "use CVAT_EXPORT_LOCKED_RETRY_INTERVAL instead", DeprecationWarning + ) +else: + EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_EXPORT_LOCKED_RETRY_INTERVAL", 60)) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index f90ddc20e434..8757b481ace0 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1444,8 +1444,8 @@ def test_concurrent_export_and_cleanup(self): clear_removed_the_file = self.SharedBool() export_outdated_after = timedelta(seconds=1) - EXPORT_LOCK_TTL = 4 - EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = EXPORT_LOCK_TTL + 1 + EXPORT_CACHE_LOCK_TTL = 4 + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = EXPORT_CACHE_LOCK_TTL + 1 def _export(*_, task_id: int): import sys @@ -1466,7 +1466,7 @@ def patched_log_exception(logger=None, exc_info=True): original_log_exception(logger, exc_info) with ( - patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=EXPORT_LOCK_TTL), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TTL", new=EXPORT_CACHE_LOCK_TTL), patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT), patch( @@ -1497,8 +1497,8 @@ def _clear(*_, file_path: str, file_ctime: str): from cvat.apps.dataset_manager.views import LockNotAvailableError with ( - patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=EXPORT_LOCK_TTL), - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_LOCK_TTL - 1), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TTL", new=EXPORT_CACHE_LOCK_TTL), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_TTL - 1), patch( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, @@ -1938,7 +1938,7 @@ def _export_1( from cvat.apps.dataset_manager.task import export_task as original_export_task with ( - patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=LOCK_TTL), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TTL", new=LOCK_TTL), patch( "cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=LOCK_ACQUISITION_TIMEOUT, @@ -1978,7 +1978,7 @@ def _export_2( from cvat.apps.dataset_manager.task import export_task as original_export_task with ( - patch("cvat.apps.dataset_manager.views.EXPORT_LOCK_TTL", new=LOCK_TTL), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TTL", new=LOCK_TTL), patch( "cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=LOCK_ACQUISITION_TIMEOUT, diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 3c3d49719f5c..7cf64e2ff0fc 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -54,9 +54,9 @@ def log_exception(logger: logging.Logger | None = None, exc_info: bool = True): 'job': JOB_CACHE_TTL, } -EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = timedelta(seconds=settings.DATASET_CACHE_LOCK_ACQUISITION_TIMEOUT) -EXPORT_LOCKED_RETRY_INTERVAL = timedelta(seconds=settings.DATASET_EXPORT_LOCKED_RETRY_INTERVAL) -EXPORT_LOCK_TTL = timedelta(seconds=settings.DATASET_EXPORT_LOCK_TTL) +EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = timedelta(seconds=settings.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT) +EXPORT_CACHE_LOCK_TTL = timedelta(seconds=settings.EXPORT_CACHE_LOCK_TTL) +EXPORT_LOCKED_RETRY_INTERVAL = timedelta(seconds=settings.EXPORT_LOCKED_RETRY_INTERVAL) def get_export_cache_ttl(db_instance: str | Project | Task | Job) -> timedelta: @@ -160,7 +160,7 @@ def export( # 2. to create a file when it doesn't exist with get_export_cache_lock( output_path, - ttl=EXPORT_LOCK_TTL, + ttl=EXPORT_CACHE_LOCK_TTL, acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ): if osp_exists(output_path): @@ -173,7 +173,7 @@ def export( server_url=server_url, save_images=save_images) with get_export_cache_lock( output_path, - ttl=EXPORT_LOCK_TTL, + ttl=EXPORT_CACHE_LOCK_TTL, acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, ): os.replace(temp_file, output_path) @@ -244,7 +244,7 @@ def clear_export_cache(file_path: str, file_ctime: float, logger: logging.Logger file_path, block=True, acquire_timeout=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT, - ttl=EXPORT_LOCK_TTL, + ttl=EXPORT_CACHE_LOCK_TTL, ): if not osp.exists(file_path): raise FileNotFoundError("Export cache file '{}' doesn't exist".format(file_path)) From 0376d35b7ae11c3d8524609aa08557aa073f5f2d Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 13:10:28 +0100 Subject: [PATCH 25/32] Move settings into engine --- cvat/apps/dataset_manager/default_settings.py | 47 ----------------- cvat/apps/engine/default_settings.py | 52 +++++++++++++++++++ 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index 91ab3b6a9c87..10c234a1f356 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -3,53 +3,6 @@ # SPDX-License-Identifier: MIT import os -import warnings -from django.core.exceptions import ImproperlyConfigured - DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 24)) "Base lifetime for cached exported datasets, in seconds" - -default_export_cache_lock_ttl = 30 - -EXPORT_CACHE_LOCK_TTL = os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL") -"Default lifetime for the export cache lock, in seconds." - -if EXPORT_CACHE_LOCK_TTL is not None: - EXPORT_CACHE_LOCK_TTL = int(EXPORT_CACHE_LOCK_TTL) - warnings.warn( - "The CVAT_DATASET_EXPORT_LOCK_TTL is deprecated, " - "use CVAT_EXPORT_CACHE_LOCK_TTL instead", DeprecationWarning - ) -else: - EXPORT_CACHE_LOCK_TTL = int(os.getenv("CVAT_EXPORT_CACHE_LOCK_TTL", default_export_cache_lock_ttl)) - -EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") -"Timeout for cache lock acquiring, in seconds" - -if EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT is not None: - EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int(EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT) - warnings.warn( - "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " - "use CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT instead", DeprecationWarning - ) -else: - default_export_cache_lock_acquire_timeout = 50 - EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int(os.getenv("CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", default_export_cache_lock_acquire_timeout)) - -if EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT <= EXPORT_CACHE_LOCK_TTL: - raise ImproperlyConfigured( - "Lock acquisition timeout must be more than lock TTL" - ) - -EXPORT_LOCKED_RETRY_INTERVAL = os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL") -"Retry interval for cases the export cache lock was unavailable, in seconds" - -if EXPORT_LOCKED_RETRY_INTERVAL is not None: - EXPORT_LOCKED_RETRY_INTERVAL = int(EXPORT_LOCKED_RETRY_INTERVAL) - warnings.warn( - "The CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL is deprecated, " - "use CVAT_EXPORT_LOCKED_RETRY_INTERVAL instead", DeprecationWarning - ) -else: - EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_EXPORT_LOCKED_RETRY_INTERVAL", 60)) diff --git a/cvat/apps/engine/default_settings.py b/cvat/apps/engine/default_settings.py index 15e1b3fd8c32..9d80c29f01db 100644 --- a/cvat/apps/engine/default_settings.py +++ b/cvat/apps/engine/default_settings.py @@ -3,8 +3,10 @@ # SPDX-License-Identifier: MIT import os +import warnings from attrs.converters import to_bool +from django.core.exceptions import ImproperlyConfigured MEDIA_CACHE_ALLOW_STATIC_CACHE = to_bool(os.getenv("CVAT_ALLOW_STATIC_CACHE", False)) """ @@ -24,3 +26,53 @@ """ Sets the frequency of checking the readiness of the chunk """ + +default_export_cache_lock_ttl = 30 + +EXPORT_CACHE_LOCK_TTL = os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL") +"Default lifetime for the export cache lock, in seconds." + +if EXPORT_CACHE_LOCK_TTL is not None: + EXPORT_CACHE_LOCK_TTL = int(EXPORT_CACHE_LOCK_TTL) + warnings.warn( + "The CVAT_DATASET_EXPORT_LOCK_TTL is deprecated, " "use CVAT_EXPORT_CACHE_LOCK_TTL instead", + DeprecationWarning, + ) +else: + EXPORT_CACHE_LOCK_TTL = int( + os.getenv("CVAT_EXPORT_CACHE_LOCK_TTL", default_export_cache_lock_ttl) + ) + +EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = os.getenv("CVAT_DATASET_CACHE_LOCK_TIMEOUT") +"Timeout for cache lock acquiring, in seconds" + +if EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT is not None: + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int(EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT) + warnings.warn( + "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " + "use CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT instead", + DeprecationWarning, + ) +else: + default_export_cache_lock_acquire_timeout = 50 + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int( + os.getenv( + "CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", default_export_cache_lock_acquire_timeout + ) + ) + +if EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT <= EXPORT_CACHE_LOCK_TTL: + raise ImproperlyConfigured("Lock acquisition timeout must be more than lock TTL") + +EXPORT_LOCKED_RETRY_INTERVAL = os.getenv("CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL") +"Retry interval for cases the export cache lock was unavailable, in seconds" + +if EXPORT_LOCKED_RETRY_INTERVAL is not None: + EXPORT_LOCKED_RETRY_INTERVAL = int(EXPORT_LOCKED_RETRY_INTERVAL) + warnings.warn( + "The CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL is deprecated, " + "use CVAT_EXPORT_LOCKED_RETRY_INTERVAL instead", + DeprecationWarning, + ) +else: + EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_EXPORT_LOCKED_RETRY_INTERVAL", 60)) From 6f09d46014e61b623af37e3ac7fc8c11f199fe9f Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 13:13:39 +0100 Subject: [PATCH 26/32] Fix formatting --- cvat/apps/engine/default_settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cvat/apps/engine/default_settings.py b/cvat/apps/engine/default_settings.py index 9d80c29f01db..4a11d19794f9 100644 --- a/cvat/apps/engine/default_settings.py +++ b/cvat/apps/engine/default_settings.py @@ -35,7 +35,7 @@ if EXPORT_CACHE_LOCK_TTL is not None: EXPORT_CACHE_LOCK_TTL = int(EXPORT_CACHE_LOCK_TTL) warnings.warn( - "The CVAT_DATASET_EXPORT_LOCK_TTL is deprecated, " "use CVAT_EXPORT_CACHE_LOCK_TTL instead", + "The CVAT_DATASET_EXPORT_LOCK_TTL is deprecated, use CVAT_EXPORT_CACHE_LOCK_TTL instead", DeprecationWarning, ) else: From 2a3c19d368ca04ff7b0b930f563f7c6a8fb7a9bc Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 13:55:43 +0100 Subject: [PATCH 27/32] Update test_concurrent_export_and_cleanup --- .../dataset_manager/tests/test_rest_api_formats.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py index 8757b481ace0..50883826b5a5 100644 --- a/cvat/apps/dataset_manager/tests/test_rest_api_formats.py +++ b/cvat/apps/dataset_manager/tests/test_rest_api_formats.py @@ -1445,12 +1445,13 @@ def test_concurrent_export_and_cleanup(self): export_outdated_after = timedelta(seconds=1) EXPORT_CACHE_LOCK_TTL = 4 - EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = EXPORT_CACHE_LOCK_TTL + 1 + EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = EXPORT_CACHE_LOCK_TTL * 2 def _export(*_, task_id: int): import sys from os import replace as original_replace from os.path import exists as original_exists + from cvat.apps.dataset_manager.task import export_task as original_export_task from cvat.apps.dataset_manager.views import log_exception as original_log_exception @@ -1481,10 +1482,14 @@ def patched_log_exception(logger=None, exc_info=True): "cvat.apps.dataset_manager.views.os.replace", side_effect=original_replace ) as mock_os_replace, patch("cvat.apps.dataset_manager.views.log_exception", new=patched_log_exception), + patch("cvat.apps.dataset_manager.views.task.export_task") as mock_export_fn, ): mock_osp_exists.side_effect = chain_side_effects( original_exists, side_effect(set_condition, export_checked_the_file), + ) + mock_export_fn.side_effect = chain_side_effects( + original_export_task, side_effect(wait_condition, clear_has_been_finished), ) result_file = export(dst_format=format_name, task_id=task_id) @@ -1494,11 +1499,11 @@ def patched_log_exception(logger=None, exc_info=True): def _clear(*_, file_path: str, file_ctime: str): from os import remove as original_remove - from cvat.apps.dataset_manager.views import LockNotAvailableError + from cvat.apps.dataset_manager.views import FileIsBeingUsedError with ( patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_TTL", new=EXPORT_CACHE_LOCK_TTL), - patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_TTL - 1), + patch("cvat.apps.dataset_manager.views.EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", new=EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT), patch( "cvat.apps.dataset_manager.views.get_export_cache_lock", new=self.patched_get_export_cache_lock, @@ -1525,7 +1530,7 @@ def _clear(*_, file_path: str, file_ctime: str): clear_export_cache( file_path=file_path, file_ctime=file_ctime, logger=MagicMock() ) - except LockNotAvailableError: + except FileIsBeingUsedError: set_condition(clear_has_been_finished) mock_os_remove.assert_not_called() From 2a2111e594b39d97848beec79554eb9f7c76fdb4 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 14:01:03 +0100 Subject: [PATCH 28/32] Update changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md Co-authored-by: Maxim Zhiltsov --- .../20241218_120228_maria_fix_export_job_lock_releasing.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md b/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md index 75b0455bf024..00b3c9e84c16 100644 --- a/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md +++ b/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md @@ -1,9 +1,6 @@ ### Fixed -- Acquiring a long lock with a worker TTL could result in the lock not being released, - blocking further exports with the same parameters () +- Exporting datasets could start significantly later than expected, both for 1 and several users in the same project/task/job + () - Scheduled RQ jobs could not be restarted due to incorrect RQ job status updating and handling () -- Parallel exports with the same parameters by different users caused - a LockNotAvailableError to be raised for all users except the one - who initiated the export first () From 76928b8d85e560fad3af49ff85d5ea119a962808 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 14:29:42 +0100 Subject: [PATCH 29/32] Fix formatting --- .../20241218_120228_maria_fix_export_job_lock_releasing.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md b/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md index 00b3c9e84c16..28115fe7f6c5 100644 --- a/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md +++ b/changelog.d/20241218_120228_maria_fix_export_job_lock_releasing.md @@ -1,6 +1,6 @@ ### Fixed -- Exporting datasets could start significantly later than expected, both for 1 and several users in the same project/task/job - () +- Exporting datasets could start significantly later than expected, both for 1 + and several users in the same project/task/job () - Scheduled RQ jobs could not be restarted due to incorrect RQ job status updating and handling () From 3c5f9b476fe3d837d8d19961efeeaa21a10f3cd5 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Wed, 18 Dec 2024 19:32:04 +0100 Subject: [PATCH 30/32] DATASET_CACHE_TTL -> EXPORT_CACHE_TTL --- cvat/apps/dataset_manager/default_settings.py | 5 ----- cvat/apps/dataset_manager/views.py | 2 +- cvat/apps/engine/default_settings.py | 12 ++++++++++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py index 10c234a1f356..bd6d6576ecf2 100644 --- a/cvat/apps/dataset_manager/default_settings.py +++ b/cvat/apps/dataset_manager/default_settings.py @@ -1,8 +1,3 @@ # Copyright (C) 2024 CVAT.ai Corporation # # SPDX-License-Identifier: MIT - -import os - -DATASET_CACHE_TTL = int(os.getenv("CVAT_DATASET_CACHE_TTL", 60 * 60 * 24)) -"Base lifetime for cached exported datasets, in seconds" diff --git a/cvat/apps/dataset_manager/views.py b/cvat/apps/dataset_manager/views.py index 7cf64e2ff0fc..52bc9cd15f7a 100644 --- a/cvat/apps/dataset_manager/views.py +++ b/cvat/apps/dataset_manager/views.py @@ -44,7 +44,7 @@ def log_exception(logger: logging.Logger | None = None, exc_info: bool = True): (_MODULE_NAME, current_function_name(2)), exc_info=exc_info) -DEFAULT_CACHE_TTL = timedelta(seconds=settings.DATASET_CACHE_TTL) +DEFAULT_CACHE_TTL = timedelta(seconds=settings.EXPORT_CACHE_TTL) PROJECT_CACHE_TTL = DEFAULT_CACHE_TTL TASK_CACHE_TTL = DEFAULT_CACHE_TTL JOB_CACHE_TTL = DEFAULT_CACHE_TTL diff --git a/cvat/apps/engine/default_settings.py b/cvat/apps/engine/default_settings.py index 4a11d19794f9..9b0d4781f63a 100644 --- a/cvat/apps/engine/default_settings.py +++ b/cvat/apps/engine/default_settings.py @@ -27,6 +27,18 @@ Sets the frequency of checking the readiness of the chunk """ +EXPORT_CACHE_TTL = os.getenv("CVAT_DATASET_CACHE_TTL") +"Base lifetime for cached export files, in seconds" + +if EXPORT_CACHE_TTL is not None: + EXPORT_CACHE_TTL = int(EXPORT_CACHE_TTL) + warnings.warn( + "The CVAT_DATASET_CACHE_TTL is deprecated, use CVAT_EXPORT_CACHE_TTL instead", + DeprecationWarning, + ) +else: + EXPORT_CACHE_TTL = int(os.getenv("CVAT_EXPORT_CACHE_TTL", 60 * 60 * 24)) + default_export_cache_lock_ttl = 30 EXPORT_CACHE_LOCK_TTL = os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL") From 788edb80958b08dd373b351e9b190fc8b3a51d70 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 19 Dec 2024 12:01:22 +0100 Subject: [PATCH 31/32] Remove dataset_manager/default_settings.py --- cvat/apps/dataset_manager/apps.py | 6 ------ cvat/apps/dataset_manager/default_settings.py | 3 --- cvat/apps/engine/default_settings.py | 15 ++++++++++----- 3 files changed, 10 insertions(+), 14 deletions(-) delete mode 100644 cvat/apps/dataset_manager/default_settings.py diff --git a/cvat/apps/dataset_manager/apps.py b/cvat/apps/dataset_manager/apps.py index 3e62d078171c..c3e206c8e5ba 100644 --- a/cvat/apps/dataset_manager/apps.py +++ b/cvat/apps/dataset_manager/apps.py @@ -10,9 +10,3 @@ class DatasetManagerConfig(AppConfig): def ready(self) -> None: from django.conf import settings - - from . import default_settings - - for key in dir(default_settings): - if key.isupper() and not hasattr(settings, key): - setattr(settings, key, getattr(default_settings, key)) diff --git a/cvat/apps/dataset_manager/default_settings.py b/cvat/apps/dataset_manager/default_settings.py deleted file mode 100644 index bd6d6576ecf2..000000000000 --- a/cvat/apps/dataset_manager/default_settings.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (C) 2024 CVAT.ai Corporation -# -# SPDX-License-Identifier: MIT diff --git a/cvat/apps/engine/default_settings.py b/cvat/apps/engine/default_settings.py index 9b0d4781f63a..76c1f30f5ebc 100644 --- a/cvat/apps/engine/default_settings.py +++ b/cvat/apps/engine/default_settings.py @@ -26,6 +26,10 @@ """ Sets the frequency of checking the readiness of the chunk """ +default_export_cache_ttl = 60 * 60 * 24 +default_export_cache_lock_ttl = 30 +default_export_cache_lock_acquisition_timeout = 50 +default_export_locked_retry_interval = 60 EXPORT_CACHE_TTL = os.getenv("CVAT_DATASET_CACHE_TTL") "Base lifetime for cached export files, in seconds" @@ -37,9 +41,8 @@ DeprecationWarning, ) else: - EXPORT_CACHE_TTL = int(os.getenv("CVAT_EXPORT_CACHE_TTL", 60 * 60 * 24)) + EXPORT_CACHE_TTL = int(os.getenv("CVAT_EXPORT_CACHE_TTL", default_export_cache_ttl)) -default_export_cache_lock_ttl = 30 EXPORT_CACHE_LOCK_TTL = os.getenv("CVAT_DATASET_EXPORT_LOCK_TTL") "Default lifetime for the export cache lock, in seconds." @@ -66,10 +69,10 @@ DeprecationWarning, ) else: - default_export_cache_lock_acquire_timeout = 50 EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int( os.getenv( - "CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", default_export_cache_lock_acquire_timeout + "CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT", + default_export_cache_lock_acquisition_timeout, ) ) @@ -87,4 +90,6 @@ DeprecationWarning, ) else: - EXPORT_LOCKED_RETRY_INTERVAL = int(os.getenv("CVAT_EXPORT_LOCKED_RETRY_INTERVAL", 60)) + EXPORT_LOCKED_RETRY_INTERVAL = int( + os.getenv("CVAT_EXPORT_LOCKED_RETRY_INTERVAL", default_export_locked_retry_interval) + ) From 522d5a103c9ad74a025b9e960b0271f166b5e815 Mon Sep 17 00:00:00 2001 From: Maria Khrustaleva Date: Thu, 19 Dec 2024 12:30:50 +0100 Subject: [PATCH 32/32] Fix missing warnings --- cvat/apps/dataset_manager/apps.py | 3 --- cvat/apps/engine/default_settings.py | 16 +++++++--------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/cvat/apps/dataset_manager/apps.py b/cvat/apps/dataset_manager/apps.py index c3e206c8e5ba..2d2a03c51645 100644 --- a/cvat/apps/dataset_manager/apps.py +++ b/cvat/apps/dataset_manager/apps.py @@ -7,6 +7,3 @@ class DatasetManagerConfig(AppConfig): name = "cvat.apps.dataset_manager" - - def ready(self) -> None: - from django.conf import settings diff --git a/cvat/apps/engine/default_settings.py b/cvat/apps/engine/default_settings.py index 76c1f30f5ebc..f853d3bc8219 100644 --- a/cvat/apps/engine/default_settings.py +++ b/cvat/apps/engine/default_settings.py @@ -2,12 +2,14 @@ # # SPDX-License-Identifier: MIT +import logging as log import os -import warnings from attrs.converters import to_bool from django.core.exceptions import ImproperlyConfigured +logger = log.getLogger("cvat") + MEDIA_CACHE_ALLOW_STATIC_CACHE = to_bool(os.getenv("CVAT_ALLOW_STATIC_CACHE", False)) """ Allow or disallow static media cache. @@ -36,9 +38,8 @@ if EXPORT_CACHE_TTL is not None: EXPORT_CACHE_TTL = int(EXPORT_CACHE_TTL) - warnings.warn( + logger.warning( "The CVAT_DATASET_CACHE_TTL is deprecated, use CVAT_EXPORT_CACHE_TTL instead", - DeprecationWarning, ) else: EXPORT_CACHE_TTL = int(os.getenv("CVAT_EXPORT_CACHE_TTL", default_export_cache_ttl)) @@ -49,9 +50,8 @@ if EXPORT_CACHE_LOCK_TTL is not None: EXPORT_CACHE_LOCK_TTL = int(EXPORT_CACHE_LOCK_TTL) - warnings.warn( + logger.warning( "The CVAT_DATASET_EXPORT_LOCK_TTL is deprecated, use CVAT_EXPORT_CACHE_LOCK_TTL instead", - DeprecationWarning, ) else: EXPORT_CACHE_LOCK_TTL = int( @@ -63,10 +63,9 @@ if EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT is not None: EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int(EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT) - warnings.warn( + logger.warning( "The CVAT_DATASET_CACHE_LOCK_TIMEOUT is deprecated, " "use CVAT_EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT instead", - DeprecationWarning, ) else: EXPORT_CACHE_LOCK_ACQUISITION_TIMEOUT = int( @@ -84,10 +83,9 @@ if EXPORT_LOCKED_RETRY_INTERVAL is not None: EXPORT_LOCKED_RETRY_INTERVAL = int(EXPORT_LOCKED_RETRY_INTERVAL) - warnings.warn( + logger.warning( "The CVAT_DATASET_EXPORT_LOCKED_RETRY_INTERVAL is deprecated, " "use CVAT_EXPORT_LOCKED_RETRY_INTERVAL instead", - DeprecationWarning, ) else: EXPORT_LOCKED_RETRY_INTERVAL = int(