Feature/background prune 2 (#2583)

* first cut at redis

* some new helper functions for the db

* ignore kombu tables in alembic migrations (used by celery)

* multiline commands for readability, add vespa_metadata_sync queue to worker

* typo fix

* fix returning tuple fields

* add constants

* fix _get_access_for_document

* docstrings!

* fix double function declaration and typing

* fix type hinting

* add a global redis pool

* Add get_document function

* use task_logger in various celery tasks

* add celeryconfig.py to simplify configuration. Will be used in a subsequent commit

* Add celery redis helper. used in a subsequent PR

* kombu warning getting spammy since celery is not self managing its queue in Postgres any more

* add last_modified and last_synced to documents

* fix task naming convention

* use celeryconfig.py

* the big one. adds queues and tasks, updates functions to use the queues with priorities, etc

* change vespa index log line to debug

* mypy fixes

* update alembic migration

* fix fence ordering, rename to "monitor", fix fetch_versioned_implementation call

* mypy

* switch to monotonic time

* fix startup dependencies on redis

* rebase alembic migration

* kombu cleanup - fail silently

* mypy

* add redis_host environment override

* update REDIS_HOST env var in docker-compose.dev.yml

* update the rest of the docker files

* in flight

* harden indexing-status endpoint against db changes happening in the background.  Needs further improvement but OK for now.

* allow syncs that generate no tasks, since we create certain objects with no entries that are initially marked as out of date

* add back writing to vespa on indexing

* actually working connector deletion

* update contributing guide

* backporting fixes from background_deletion

* renaming cache to cache_volume

* add redis password to various deployments

* try setting up pr testing for helm

* fix indent

* hopefully this release version actually exists

* fix command line option to --chart-dirs

* fetch-depth 0

* edit values.yaml

* try setting ct working directory

* bypass testing only on change for now

* move files and lint them

* update helm testing

* some issues suggest using --config works

* add vespa repo

* add postgresql repo

* increase timeout

* try amd64 runner

* fix redis password reference

* add comment to helm chart testing workflow

* rename helm testing workflow to disable it

* adding clarifying comments

* address code review

* missed a file

* remove commented warning ... just not needed

* fix imports

* refactor to use update_single

* mypy fixes

* add vespa test

* multiple celery workers

* update logs as well and set prefetch multipliers appropriate to the worker intent

* add db refresh to connector deletion

* add some preliminary locking

* organize tasks into separate files

* celery auto associates tasks created inside another task, which bloats the result metadata considerably. trail=False prevents this (a short sketch follows this change list)

* code review fixes

* move monitor_usergroup_taskset to ee, improve logging

* add multi workers to dev_run_background_jobs.py

* update supervisord with some recommended settings for celery

* name celery workers and shorten dev script prefixing

* add configurable sql alchemy engine settings on startup (needed for various intents like API server, different celery workers and tasks, etc)

* fix comments

* autoscale sqlalchemy pool size to celery concurrency (allow override later?)

* supervisord needs the percent symbols escaped

* use name as primary check, some minor refactoring and type hinting too.

* stash merge (may not function yet)

* remove dead code

* more cleanup

* remove dead file

* we shouldn't be checking for deletion attempts in the db any more

* print cc_pair_id

* print status on status mismatch again

* add logging when cc_pair isn't present

* don't index any ingestion-type connectors, and don't pause any connectors that aren't active

* add more specific check for deletion completion

* remove flaky mediawiki test site

* move is_pruning

* remove unused code

* remove old function

---------
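
The trail=False note above refers to Celery's result trail: by default, any task launched from inside another task is recorded in the parent's result metadata (result.children), which is what was bloating the result backend. A minimal sketch of the setting, with made-up task names purely for illustration (the real tasks live under danswer.background.celery.tasks and are not shown here):

from celery import shared_task


@shared_task(name="prune_generator_sketch", trail=False)
def prune_generator_sketch(cc_pair_id: int) -> None:
    # with trail=False, the subtasks spawned below are not appended to this
    # task's own result entry in the result backend
    for doc_id in ("doc-1", "doc-2"):
        document_cleanup_sketch.delay(doc_id)


@shared_task(name="document_cleanup_sketch")
def document_cleanup_sketch(doc_id: str) -> None:
    # placeholder body; the real cleanup work is connector/credential specific
    pass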

Co-authored-by: Richard Kuo <[email protected]>
rkuo-danswer and LostVector authored Oct 7, 2024
1 parent 64909d7 commit 3404c7e
Showing 23 changed files with 649 additions and 460 deletions.
@@ -0,0 +1,27 @@
"""add last_pruned to the connector_credential_pair table
Revision ID: ac5eaac849f9
Revises: 52a219fb5233
Create Date: 2024-09-10 15:04:26.437118
"""
from alembic import op
import sqlalchemy as sa

# revision identifiers, used by Alembic.
revision = "ac5eaac849f9"
down_revision = "46b7a812670f"
branch_labels = None
depends_on = None


def upgrade() -> None:
    # last pruned represents the last time the connector was pruned
    op.add_column(
        "connector_credential_pair",
        sa.Column("last_pruned", sa.DateTime(timezone=True), nullable=True),
    )


def downgrade() -> None:
    op.drop_column("connector_credential_pair", "last_pruned")
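
The migration only adds the column; presumably the pruning flow stamps it whenever a run finishes. The ORM-side change is not among the files shown here, so the helper below is only a sketch, assuming a matching last_pruned attribute on the ConnectorCredentialPair model:

from datetime import datetime
from datetime import timezone

from sqlalchemy.orm import Session

from danswer.db.models import ConnectorCredentialPair  # assumed to carry last_pruned


def mark_cc_pair_pruned(cc_pair_id: int, db_session: Session) -> None:
    # hypothetical helper: record when this cc_pair was last pruned
    cc_pair = (
        db_session.query(ConnectorCredentialPair)
        .filter(ConnectorCredentialPair.id == cc_pair_id)
        .one()
    )
    cc_pair.last_pruned = datetime.now(timezone.utc)
    db_session.commit()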
33 changes: 29 additions & 4 deletions backend/danswer/background/celery/celery_app.py
@@ -19,6 +19,7 @@

from danswer.background.celery.celery_redis import RedisConnectorCredentialPair
from danswer.background.celery.celery_redis import RedisConnectorDeletion
from danswer.background.celery.celery_redis import RedisConnectorPruning
from danswer.background.celery.celery_redis import RedisDocumentSet
from danswer.background.celery.celery_redis import RedisUserGroup
from danswer.background.celery.celery_utils import celery_is_worker_primary
@@ -104,6 +105,13 @@ def celery_task_postrun(
            r.srem(rcd.taskset_key, task_id)
        return

    if task_id.startswith(RedisConnectorPruning.SUBTASK_PREFIX):
        cc_pair_id = RedisConnectorPruning.get_id_from_task_id(task_id)
        if cc_pair_id is not None:
            rcp = RedisConnectorPruning(cc_pair_id)
            r.srem(rcp.taskset_key, task_id)
        return


@beat_init.connect
def on_beat_init(sender: Any, **kwargs: Any) -> None:
@@ -236,6 +244,18 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None:
    for key in r.scan_iter(RedisConnectorDeletion.FENCE_PREFIX + "*"):
        r.delete(key)

    for key in r.scan_iter(RedisConnectorPruning.TASKSET_PREFIX + "*"):
        r.delete(key)

    for key in r.scan_iter(RedisConnectorPruning.GENERATOR_COMPLETE_PREFIX + "*"):
        r.delete(key)

    for key in r.scan_iter(RedisConnectorPruning.GENERATOR_PROGRESS_PREFIX + "*"):
        r.delete(key)

    for key in r.scan_iter(RedisConnectorPruning.FENCE_PREFIX + "*"):
        r.delete(key)


@worker_ready.connect
def on_worker_ready(sender: Any, **kwargs: Any) -> None:
@@ -330,7 +350,11 @@ def on_setup_logging(

class HubPeriodicTask(bootsteps.StartStopStep):
    """Regularly reacquires the primary worker lock outside of the task queue.
    Use the task_logger in this class to avoid double logging."""
    Use the task_logger in this class to avoid double logging.
    This cannot be done inside a regular beat task because it must run on schedule and
    a queue of existing work would starve the task from running.
    """

    # it's unclear to me whether using the hub's timer or the bootstep timer is better
    requires = {"celery.worker.components:Hub"}
@@ -405,6 +429,7 @@ def stop(self, worker: Any) -> None:
        "danswer.background.celery.tasks.connector_deletion",
        "danswer.background.celery.tasks.periodic",
        "danswer.background.celery.tasks.pruning",
        "danswer.background.celery.tasks.shared",
        "danswer.background.celery.tasks.vespa",
    ]
)
@@ -425,16 +450,16 @@ def stop(self, worker: Any) -> None:
            "task": "check_for_connector_deletion_task",
            # don't need to check too often, since we kick off a deletion initially
            # during the API call that actually marks the CC pair for deletion
            "schedule": timedelta(minutes=1),
            "schedule": timedelta(seconds=60),
            "options": {"priority": DanswerCeleryPriority.HIGH},
        },
    }
)
celery_app.conf.beat_schedule.update(
    {
        "check-for-prune": {
            "task": "check_for_prune_task",
            "schedule": timedelta(seconds=5),
            "task": "check_for_prune_task_2",
            "schedule": timedelta(seconds=60),
            "options": {"priority": DanswerCeleryPriority.HIGH},
        },
    }
)
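
The postrun handler above maps a finishing subtask back to its taskset via get_id_from_task_id. That helper lives on the shared Redis helper base class and is not part of this diff; assuming the "<prefix>_<id>_<uuid>" naming used by the pruning keys in celery_redis.py below, the idea is roughly:

def get_id_from_task_id_sketch(task_id: str) -> int | None:
    # e.g. "connectorpruning+sub_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac" -> 1
    parts = task_id.split("_")
    if len(parts) != 3:
        return None
    try:
        return int(parts[1])
    except ValueError:
        return None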
119 changes: 119 additions & 0 deletions backend/danswer/background/celery/celery_redis.py
@@ -343,6 +343,125 @@ def generate_tasks(
        return len(async_results)


class RedisConnectorPruning(RedisObjectHelper):
    """Celery will kick off a long running generator task to crawl the connector and
    find any missing docs, which will each then get a new cleanup task. The progress of
    those tasks will then be monitored to completion.
    Example rough happy path order:
    Check connectorpruning_fence_1
    Send generator task with id connectorpruning+generator_1_{uuid}
    generator runs connector with callbacks that increment connectorpruning_generator_progress_1
    generator creates many subtasks with id connectorpruning+sub_1_{uuid}
    in taskset connectorpruning_taskset_1
    on completion, generator sets connectorpruning_generator_complete_1
    celery postrun removes subtasks from taskset
    monitor beat task cleans up when taskset reaches 0 items
    """

PREFIX = "connectorpruning"
FENCE_PREFIX = PREFIX + "_fence" # a fence for the entire pruning process
GENERATOR_TASK_PREFIX = PREFIX + "+generator"

TASKSET_PREFIX = PREFIX + "_taskset" # stores a list of prune tasks id's
SUBTASK_PREFIX = PREFIX + "+sub"

GENERATOR_PROGRESS_PREFIX = (
PREFIX + "_generator_progress"
) # a signal that contains generator progress
GENERATOR_COMPLETE_PREFIX = (
PREFIX + "_generator_complete"
) # a signal that the generator has finished

    def __init__(self, id: int) -> None:
        """id: the cc_pair_id of the connector credential pair"""

        super().__init__(id)
        self.documents_to_prune: set[str] = set()

    @property
    def generator_task_id_prefix(self) -> str:
        return f"{self.GENERATOR_TASK_PREFIX}_{self._id}"

    @property
    def generator_progress_key(self) -> str:
        # example: connectorpruning_generator_progress_1
        return f"{self.GENERATOR_PROGRESS_PREFIX}_{self._id}"

    @property
    def generator_complete_key(self) -> str:
        # example: connectorpruning_generator_complete_1
        return f"{self.GENERATOR_COMPLETE_PREFIX}_{self._id}"

    @property
    def subtask_id_prefix(self) -> str:
        return f"{self.SUBTASK_PREFIX}_{self._id}"

    def generate_tasks(
        self,
        celery_app: Celery,
        db_session: Session,
        redis_client: Redis,
        lock: redis.lock.Lock | None,
    ) -> int | None:
        last_lock_time = time.monotonic()

        async_results = []
        cc_pair = get_connector_credential_pair_from_id(self._id, db_session)
        if not cc_pair:
            return None

        for doc_id in self.documents_to_prune:
            current_time = time.monotonic()
            if lock and current_time - last_lock_time >= (
                CELERY_VESPA_SYNC_BEAT_LOCK_TIMEOUT / 4
            ):
                lock.reacquire()
                last_lock_time = current_time

            # celery's default task id format is "dd32ded3-00aa-4884-8b21-42f8332e7fac"
            # the actual redis key is "celery-task-meta-dd32ded3-00aa-4884-8b21-42f8332e7fac"
            # we prefix the task id so it's easier to keep track of who created the task
            # aka "documentset_1_6dd32ded3-00aa-4884-8b21-42f8332e7fac"
            custom_task_id = f"{self.subtask_id_prefix}_{uuid4()}"

            # add to the tracking taskset in redis BEFORE creating the celery task.
            # note that for the moment we are using a single taskset key, not differentiated by cc_pair id
            redis_client.sadd(self.taskset_key, custom_task_id)

            # Priority on sync's triggered by new indexing should be medium
            result = celery_app.send_task(
                "document_by_cc_pair_cleanup_task",
                kwargs=dict(
                    document_id=doc_id,
                    connector_id=cc_pair.connector_id,
                    credential_id=cc_pair.credential_id,
                ),
                queue=DanswerCeleryQueues.CONNECTOR_DELETION,
                task_id=custom_task_id,
                priority=DanswerCeleryPriority.MEDIUM,
            )

            async_results.append(result)

        return len(async_results)

    def is_pruning(self, db_session: Session, redis_client: Redis) -> bool:
        """A single example of a helper method being refactored into the redis helper"""
        cc_pair = get_connector_credential_pair_from_id(
            cc_pair_id=self._id, db_session=db_session
        )
        if not cc_pair:
            raise ValueError(f"cc_pair_id {self._id} does not exist.")

        if redis_client.exists(self.fence_key):
            return True

        return False


def celery_get_queue_length(queue: str, r: Redis) -> int:
    """This is a redis specific way to get the length of a celery queue.
    It is priority aware and knows how to count across the multiple redis lists
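
The beat task that actually drives this class (check_for_prune_task_2 in tasks/pruning) is collapsed out of this view. As an illustration only of the flow described in the class docstring — fence, generate subtasks, signal completion — a driver might look roughly like the following; the function name and lock handling are assumptions, not the actual task:

from celery import Celery
from redis import Redis
from sqlalchemy.orm import Session


def prune_cc_pair_sketch(
    cc_pair_id: int, celery_app: Celery, db_session: Session, r: Redis
) -> None:
    # RedisConnectorPruning is the class defined above in celery_redis.py
    rcp = RedisConnectorPruning(cc_pair_id)

    # skip if a prune is already fenced off for this cc_pair
    if rcp.is_pruning(db_session, r):
        return

    # fence the whole pruning run before any subtasks exist
    r.set(rcp.fence_key, 1)

    # (the generator task would populate rcp.documents_to_prune from the connector here)
    tasks_generated = rcp.generate_tasks(celery_app, db_session, r, None)

    # signal that generation is done; the monitor task cleans up once the taskset drains
    r.set(rcp.generator_complete_key, tasks_generated or 0)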
85 changes: 14 additions & 71 deletions backend/danswer/background/celery/celery_utils.py
@@ -1,12 +1,11 @@
from collections.abc import Callable
from datetime import datetime
from datetime import timezone
from typing import Any

from sqlalchemy.orm import Session

from danswer.background.celery.celery_redis import RedisConnectorDeletion
from danswer.background.task_utils import name_cc_prune_task
from danswer.configs.app_configs import ALLOW_SIMULTANEOUS_PRUNING
from danswer.configs.app_configs import MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE
from danswer.connectors.cross_connector_utils.rate_limit_wrapper import (
    rate_limit_builder,
@@ -17,14 +16,8 @@
from danswer.connectors.interfaces import PollConnector
from danswer.connectors.models import Document
from danswer.db.connector_credential_pair import get_connector_credential_pair
from danswer.db.engine import get_db_current_time
from danswer.db.enums import TaskStatus
from danswer.db.models import Connector
from danswer.db.models import Credential
from danswer.db.models import TaskQueueState
from danswer.db.tasks import check_task_is_live_and_not_timed_out
from danswer.db.tasks import get_latest_task
from danswer.db.tasks import get_latest_task_by_type
from danswer.redis.redis_pool import get_redis_client
from danswer.server.documents.models import DeletionAttemptSnapshot
from danswer.utils.logger import setup_logger
@@ -70,72 +63,19 @@ def get_deletion_attempt_snapshot(
    )


def skip_cc_pair_pruning_by_task(
    pruning_task: TaskQueueState | None, db_session: Session
) -> bool:
    """task should be the latest prune task for this cc_pair"""
    if not ALLOW_SIMULTANEOUS_PRUNING:
        # if only one prune is allowed at any time, then check to see if any prune
        # is active
        pruning_type_task_name = name_cc_prune_task()
        last_pruning_type_task = get_latest_task_by_type(
            pruning_type_task_name, db_session
        )

        if last_pruning_type_task and check_task_is_live_and_not_timed_out(
            last_pruning_type_task, db_session
        ):
            return True

    if pruning_task and check_task_is_live_and_not_timed_out(pruning_task, db_session):
        # if the last task is live right now, we shouldn't start a new one
        return True

    return False


def should_prune_cc_pair(
    connector: Connector, credential: Credential, db_session: Session
) -> bool:
    if not connector.prune_freq:
        return False

    pruning_task_name = name_cc_prune_task(
        connector_id=connector.id, credential_id=credential.id
    )
    last_pruning_task = get_latest_task(pruning_task_name, db_session)

    if skip_cc_pair_pruning_by_task(last_pruning_task, db_session):
        return False

    current_db_time = get_db_current_time(db_session)

    if not last_pruning_task:
        # If the connector has never been pruned, then compare vs when the connector
        # was created
        time_since_initialization = current_db_time - connector.time_created
        if time_since_initialization.total_seconds() >= connector.prune_freq:
            return True
        return False

    if not last_pruning_task.start_time:
        # if the last prune task hasn't started, we shouldn't start a new one
        return False

    # if the last prune task has a start time, then compare against it to determine
    # if we should start
    time_since_last_pruning = current_db_time - last_pruning_task.start_time
    return time_since_last_pruning.total_seconds() >= connector.prune_freq


def document_batch_to_ids(doc_batch: list[Document]) -> set[str]:
    return {doc.id for doc in doc_batch}


def extract_ids_from_runnable_connector(runnable_connector: BaseConnector) -> set[str]:
def extract_ids_from_runnable_connector(
    runnable_connector: BaseConnector,
    progress_callback: Callable[[int], None] | None = None,
) -> set[str]:
    """
    If the PruneConnector hasnt been implemented for the given connector, just pull
    all docs using the load_from_state and grab out the IDs
    all docs using the load_from_state and grab out the IDs.
    Optionally, a callback can be passed to handle the length of each document batch.
    """
    all_connector_doc_ids: set[str] = set()

@@ -158,6 +98,8 @@ def extract_ids_from_runnable_connector(runnable_connector: BaseConnector) -> se
        max_calls=MAX_PRUNING_DOCUMENT_RETRIEVAL_PER_MINUTE, period=60
    )(document_batch_to_ids)
    for doc_batch in doc_batch_generator:
        if progress_callback:
            progress_callback(len(doc_batch))
        all_connector_doc_ids.update(doc_batch_processing_func(doc_batch))

    return all_connector_doc_ids
@@ -177,9 +119,10 @@ def celery_is_listening_to_queue(worker: Any, name: str) -> bool:


def celery_is_worker_primary(worker: Any) -> bool:
    """There are multiple approaches that could be taken, but the way we do it is to
    check the hostname set for the celery worker, either in celeryconfig.py or on the
    command line."""
    """There are multiple approaches that could be taken to determine if a celery worker
    is 'primary', as defined by us. But the way we do it is to check the hostname set
    for the celery worker, which can be done either in celeryconfig.py or on the
    command line with '--hostname'."""
    hostname = worker.hostname
    if hostname.startswith("light"):
        return False
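
The new progress_callback parameter gives the pruning generator a hook to report how far the document-id crawl has gotten. A rough sketch of how it might be wired to the Redis progress signal defined in celery_redis.py; the wrapper function here is illustrative, not lifted from the actual pruning task:

from redis import Redis

from danswer.background.celery.celery_redis import RedisConnectorPruning
from danswer.background.celery.celery_utils import extract_ids_from_runnable_connector
from danswer.connectors.interfaces import BaseConnector


def collect_source_doc_ids_sketch(
    cc_pair_id: int, runnable_connector: BaseConnector, r: Redis
) -> set[str]:
    rcp = RedisConnectorPruning(cc_pair_id)

    def redis_increment_callback(batch_size: int) -> None:
        # bump the generator progress signal so observers can watch the crawl advance
        r.incrby(rcp.generator_progress_key, batch_size)

    return extract_ids_from_runnable_connector(
        runnable_connector, redis_increment_callback
    )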