Skip to content

Commit

Permalink
Merge pull request #900 from endlessm/skip-content-import
Browse files Browse the repository at this point in the history
Skip content import
  • Loading branch information
dbnicholson authored Nov 15, 2023
2 parents 17cb0cd + ffaa29c commit 1a02673
Show file tree
Hide file tree
Showing 7 changed files with 389 additions and 66 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[flake8]
extend-exclude =
kolibri_explore_plugin/_version.py,
build/,
static/,
dist/,
Expand Down
77 changes: 66 additions & 11 deletions kolibri_explore_plugin/collectionviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
from kolibri.core.content.utils.content_manifest import (
ContentManifestParseError,
)
from kolibri.core.content.utils.import_export_content import (
get_import_export_data,
)
from kolibri.core.tasks.job import State as JobState
from kolibri.core.tasks.main import job_storage
from kolibri.utils import conf
Expand All @@ -26,6 +29,7 @@
from .jobs import get_channel_metadata
from .jobs import get_remotechannelimport_task
from .jobs import get_remotecontentimport_task
from .jobs import get_remoteimport_task
from .models import BackgroundTask

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -163,18 +167,37 @@ def get_channelimport_tasks(self):
def get_extra_channelimport_tasks(self):
"""Return a serializable object to create extra channelimport tasks
For all channels featured in Endless Key content manifests.
For all channels featured in Endless Key content manifests. In addition
to the channel metadata, all thumbnails are downloaded.
"""
tasks = []
for channel_id, channel_version in self.get_latest_extra_channels():
# Check if the channel metadata and thumbnails are already
# available.
metadata = get_channel_metadata(channel_id)
if metadata and metadata.version >= channel_version:
logger.debug(
f"Skipping extra channel import task for {channel_id} "
"since already present"
# The channel metadata is available. Now check if the thumbnail
# nodes are already available.
num_resources, _, _ = get_import_export_data(
channel_id,
node_ids=[],
available=False,
all_thumbnails=True,
)
continue
tasks.append(get_remotechannelimport_task(channel_id))
if num_resources == 0:
logger.debug(
f"Skipping extra channel import task for {channel_id} "
"since channel metadata and all resources already "
"present"
)
continue

tasks.append(
get_remoteimport_task(
channel_id, node_ids=[], all_thumbnails=True
)
)

return tasks

def get_contentimport_tasks(self):
Expand All @@ -189,6 +212,20 @@ def get_contentimport_tasks(self):
node_ids = list(
self._get_node_ids_for_channel(channel_metadata, channel_id)
)

# Check if the desired nodes are already available.
num_resources, _, _ = get_import_export_data(
channel_id,
node_ids=node_ids,
available=False,
)
if num_resources == 0:
logger.debug(
f"Skipping content import task for {channel_id} "
"since all resources already present"
)
continue

tasks.append(
get_remotecontentimport_task(
channel_id, channel_metadata.name, node_ids
Expand Down Expand Up @@ -219,12 +256,30 @@ def get_contentthumbnail_tasks(self):
For all the channels in this content manifest.
"""
return [
get_remotecontentimport_task(
channel_id, node_ids=[], all_thumbnails=True
tasks = []

for channel_id in self.get_channel_ids():
# Check if the desired thumbnail nodes are already available.
num_resources, _, _ = get_import_export_data(
channel_id,
node_ids=[],
available=False,
all_thumbnails=True,
)
for channel_id in self.get_channel_ids()
]
if num_resources == 0:
logger.debug(
f"Skipping content thumbnail task for {channel_id} "
"since all resources already present"
)
continue

tasks.append(
get_remotecontentimport_task(
channel_id, node_ids=[], all_thumbnails=True
)
)

return tasks

def _get_node_ids_for_channel(self, channel_metadata, channel_id):
"""Get node IDs regardless of the version
Expand Down
42 changes: 42 additions & 0 deletions kolibri_explore_plugin/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class TaskType:
APPLYEXTERNALTAGS = "kolibri_explore_plugin.tasks.applyexternaltags"
REMOTECHANNELIMPORT = "kolibri.core.content.tasks.remotechannelimport"
REMOTECONTENTIMPORT = "kolibri_explore_plugin.tasks.remotecontentimport"
REMOTEIMPORT = "kolibri.core.content.tasks.remoteimport"


def get_channel_metadata(channel_id):
Expand Down Expand Up @@ -89,6 +90,33 @@ def get_remotecontentimport_task(
}


def get_remoteimport_task(
    channel_id,
    channel_name=None,
    node_ids=None,
    all_thumbnails=False,
):
    """Build a serializable remoteimport task description.

    When ``channel_name`` is not supplied, try to resolve it from an
    already-imported channel database; before the first import that
    lookup yields nothing, so fall back to ``"unknown"``.
    """
    if not channel_name:
        # Lookup fails on first import — the channel DB isn't there yet.
        metadata = get_channel_metadata(channel_id)
        channel_name = metadata.name if metadata else "unknown"

    params = {
        "channel_id": channel_id,
        "channel_name": channel_name,
        "node_ids": node_ids,
        "exclude_node_ids": [],
        "all_thumbnails": all_thumbnails,
        "fail_on_error": True,
    }
    return {"task": TaskType.REMOTEIMPORT, "params": params}


def get_content_task_user():
"""Get a Kolibri user for content task usage"""
return FacilityUser.objects.filter(
Expand Down Expand Up @@ -178,9 +206,23 @@ def storage_update_hook(job, orm_job, state=None, **kwargs):
elif state == State.COMPLETED:
# If the completed task is a channel import, create the
# associated thumbnail download task to be run later.
#
# FIXME: Previously the extra channels and their thumbnails were
# imported using 2 tasks. In order to keep the thumbnail task from
# being created before the channel was imported, the thumbnail task was
# created on the fly here. Now this is done with a single combined
# remoteimport task and this is no longer needed. However, it's kept
# for now in case there are existing installations that had started the
# background tasks but not completed them. Drop this at some point.
#
# https://github.com/endlessm/kolibri-explore-plugin/issues/890
if bg_task.func == TaskType.REMOTECHANNELIMPORT:
bg_task_params = json.loads(bg_task.params)
channel_id = bg_task_params["channel_id"]
logger.warning(
f"Creating thumbnail task for {channel_id} legacy background "
"channel import task"
)
thumbnail_task_data = get_remotecontentimport_task(
channel_id, node_ids=[], all_thumbnails=True
)
Expand Down
15 changes: 0 additions & 15 deletions kolibri_explore_plugin/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,6 @@ def contentdir(serverdir):
@pytest.fixture
def content_server(serverdir, contentdir, monkeypatch):
"""HTTP content server using test data"""
from kolibri.core.discovery.utils.network.client import NetworkClient
from kolibri.core.content.utils import resource_import

with ContentServer(serverdir) as server:
# Override the Kolibri content server URL.
monkeypatch.setitem(
Expand All @@ -88,18 +85,6 @@ def content_server(serverdir, contentdir, monkeypatch):
server.url,
)

# Don't introspect the server for info.
monkeypatch.setattr(
NetworkClient,
"build_for_address",
lambda addr: NetworkClient(addr),
)
monkeypatch.setattr(
resource_import,
"lookup_channel_listing_status",
lambda channel_id, baseurl: None,
)

yield server


Expand Down
15 changes: 15 additions & 0 deletions kolibri_explore_plugin/test/test_collectionviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
from kolibri.core.content.models import ChannelMetadata
from kolibri.core.content.models import ContentNode
from kolibri.core.content.models import LocalFile
from kolibri.core.tasks import main as tasks_main
from rest_framework.test import APIClient

from .utils import COLLECTIONSDIR
from .utils import ExploreTestTimeoutError
from .utils import wait_for_background_tasks
from kolibri_explore_plugin import collectionviews
from kolibri_explore_plugin.jobs import TaskType


@pytest.mark.django_db
Expand Down Expand Up @@ -220,6 +222,11 @@ def test_download_manager_preload(facility_user, grade, name):
assert num_initial_channels == len(all_channels)
assert LocalFile.objects.filter(available=False).count() == 0

# Clear all the jobs to check if any downloading jobs were created
# later.
job_storage = tasks_main.job_storage
job_storage.clear(force=True)

# Run the downloader with requests blocked. Since no URLs are mocked, all
# requests will fail. Since the download manager retries tasks forever, it
# will eventually time out on any request.
Expand All @@ -233,3 +240,11 @@ def test_download_manager_preload(facility_user, grade, name):
assert (
LocalFile.objects.filter(available=True).count() == num_initial_files
)

# Check that no channel or content import jobs were created.
channel_jobs = job_storage.filter_jobs(func=TaskType.REMOTECHANNELIMPORT)
assert channel_jobs == []
content_jobs = job_storage.filter_jobs(func=TaskType.REMOTECONTENTIMPORT)
assert content_jobs == []
import_jobs = job_storage.filter_jobs(func=TaskType.REMOTEIMPORT)
assert import_jobs == []
83 changes: 83 additions & 0 deletions kolibri_explore_plugin/test/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,89 @@ def test_get_remotecontentimport_task():
}


@pytest.mark.usefixtures("channel_import_db", "content_server")
@pytest.mark.django_db
def test_get_remoteimport_task():
    """Exercise get_remoteimport_task parameter handling and the
    channel-name lookup before and after the channel is imported.
    """
    channel_id = "b51baf46133045e3bce4d2d872a8f71d"
    node_ids = [
        "5a24503255ce43d98ebcb25d2b60f024",
        "91a1bfc0ede544979f861909b7862537",
    ]

    def expected(channel_name="unknown", node_ids=None, all_thumbnails=False):
        # The task dict get_remoteimport_task is expected to produce.
        return {
            "task": jobs.TaskType.REMOTEIMPORT,
            "params": {
                "channel_id": channel_id,
                "channel_name": channel_name,
                "node_ids": node_ids,
                "exclude_node_ids": [],
                "all_thumbnails": all_thumbnails,
                "fail_on_error": True,
            },
        }

    # No nodes specified.
    assert jobs.get_remoteimport_task(channel_id) == expected()

    # Specify the nodes.
    assert jobs.get_remoteimport_task(
        channel_id, node_ids=node_ids
    ) == expected(node_ids=node_ids)

    # Override the channel name.
    assert jobs.get_remoteimport_task(
        channel_id, channel_name="foo"
    ) == expected(channel_name="foo")

    # Specify an empty node list and all_thumbnails.
    assert jobs.get_remoteimport_task(
        channel_id, node_ids=[], all_thumbnails=True
    ) == expected(node_ids=[], all_thumbnails=True)

    # After importing the channel, the channel name will be known.
    importchannel(channel_id)
    assert jobs.get_remoteimport_task(channel_id) == expected(
        channel_name="testing"
    )


@pytest.mark.usefixtures(
"channel_import_db", "content_server", "facility_user", "worker"
)
Expand Down
Loading

0 comments on commit 1a02673

Please sign in to comment.