From c65478e487e98042af99bbab3243efb9abfaad74 Mon Sep 17 00:00:00 2001 From: Navin Karkera Date: Tue, 27 Aug 2024 23:15:46 +0530 Subject: [PATCH] feat: index library collections in studio meilisearch index (#35324) --- openedx/core/djangoapps/content/search/api.py | 95 ++++++++++++++----- .../djangoapps/content/search/documents.py | 38 ++++++++ .../content/search/tests/test_api.py | 42 +++++++- .../content/search/tests/test_documents.py | 35 ++++++- requirements/constraints.txt | 2 +- requirements/edx/base.txt | 2 +- requirements/edx/development.txt | 2 +- requirements/edx/doc.txt | 2 +- requirements/edx/testing.txt | 2 +- 9 files changed, 189 insertions(+), 31 deletions(-) diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index f59b9de37132..9fb49b24b6d1 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -19,6 +19,7 @@ from meilisearch.models.task import TaskInfo from opaque_keys.edx.keys import UsageKey from opaque_keys.edx.locator import LibraryLocatorV2 +from openedx_learning.api import authoring as authoring_api from common.djangoapps.student.roles import GlobalStaff from rest_framework.request import Request from common.djangoapps.student.role_helpers import get_course_roles @@ -31,8 +32,9 @@ Fields, meili_id_from_opaque_key, searchable_doc_for_course_block, + searchable_doc_for_collection, searchable_doc_for_library_block, - searchable_doc_tags + searchable_doc_tags, ) log = logging.getLogger(__name__) @@ -294,12 +296,16 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: status_cb("Counting courses...") num_courses = CourseOverview.objects.count() + # Get the list of collections + status_cb("Counting collections...") + num_collections = authoring_api.get_collections().count() + # Some counters so we can track our progress as indexing progresses: - num_contexts = num_courses + num_libraries + num_contexts = num_courses + num_libraries + num_collections num_contexts_done = 0 # How many courses/libraries we've indexed num_blocks_done = 0 # How many individual components/XBlocks we've indexed - status_cb(f"Found {num_courses} courses and {num_libraries} libraries.") + status_cb(f"Found {num_courses} courses, {num_libraries} libraries and {num_collections} collections.") with _using_temp_index(status_cb) as temp_index_name: ############## Configure the index ############## @@ -332,6 +338,7 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: Fields.block_id, Fields.content, Fields.tags, + Fields.description, # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they # are searchable only if at least one document in the index has a value. If we didn't list them here and, # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for @@ -363,8 +370,8 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: ############## Libraries ############## status_cb("Indexing libraries...") - for lib_key in lib_keys: - status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}") + + def index_library(lib_key: str) -> list: docs = [] for component in lib_api.get_library_components(lib_key): try: @@ -375,48 +382,88 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: docs.append(doc) except Exception as err: # pylint: disable=broad-except status_cb(f"Error indexing library component {component}: {err}") - finally: - num_blocks_done += 1 if docs: try: # Add all the docs in this library at once (usually faster than adding one at a time): _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) except (TypeError, KeyError, MeilisearchError) as err: status_cb(f"Error indexing library {lib_key}: {err}") + return docs + for lib_key in lib_keys: + status_cb(f"{num_contexts_done + 1}/{num_contexts}. Now indexing library {lib_key}") + lib_docs = index_library(lib_key) + num_blocks_done += len(lib_docs) num_contexts_done += 1 ############## Courses ############## status_cb("Indexing courses...") # To reduce memory usage on large instances, split up the CourseOverviews into pages of 1,000 courses: + + def index_course(course: CourseOverview) -> list: + docs = [] + # Pre-fetch the course with all of its children: + course = store.get_course(course.id, depth=None) + + def add_with_children(block): + """ Recursively index the given XBlock/component """ + doc = searchable_doc_for_course_block(block) + doc.update(searchable_doc_tags(block.usage_key)) + docs.append(doc) # pylint: disable=cell-var-from-loop + _recurse_children(block, add_with_children) # pylint: disable=cell-var-from-loop + + # Index course children + _recurse_children(course, add_with_children) + + if docs: + # Add all the docs in this course at once (usually faster than adding one at a time): + _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + return docs + paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000) for p in paginator.page_range: for course in paginator.page(p).object_list: status_cb( f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})" ) - docs = [] - - # Pre-fetch the course with all of its children: - course = store.get_course(course.id, depth=None) + course_docs = index_course(course) + num_contexts_done += 1 + num_blocks_done += len(course_docs) - def add_with_children(block): - """ Recursively index the given XBlock/component """ - doc = searchable_doc_for_course_block(block) - doc.update(searchable_doc_tags(block.usage_key)) - docs.append(doc) # pylint: disable=cell-var-from-loop - _recurse_children(block, add_with_children) # pylint: disable=cell-var-from-loop + ############## Collections ############## + status_cb("Indexing collections...") - # Index course children - _recurse_children(course, add_with_children) + def index_collection_batch(batch, num_contexts_done) -> int: + docs = [] + for collection in batch: + status_cb( + f"{num_contexts_done + 1}/{num_contexts}. " + f"Now indexing collection {collection.title} ({collection.id})" + ) + try: + doc = searchable_doc_for_collection(collection) + # Uncomment below line once collections are tagged. + # doc.update(searchable_doc_tags(collection.id)) + docs.append(doc) + except Exception as err: # pylint: disable=broad-except + status_cb(f"Error indexing collection {collection}: {err}") + finally: + num_contexts_done += 1 - if docs: - # Add all the docs in this course at once (usually faster than adding one at a time): + if docs: + try: + # Add docs in batch of 100 at once (usually faster than adding one at a time): _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) - num_contexts_done += 1 - num_blocks_done += len(docs) + except (TypeError, KeyError, MeilisearchError) as err: + status_cb(f"Error indexing collection batch {p}: {err}") + return num_contexts_done + + # To reduce memory usage on large instances, split up the Collections into pages of 100 collections: + paginator = Paginator(authoring_api.get_collections(enabled=True), 100) + for p in paginator.page_range: + num_contexts_done = index_collection_batch(paginator.page(p).object_list, num_contexts_done) - status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses and libraries.") + status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.") def upsert_xblock_index_doc(usage_key: UsageKey, recursive: bool = True) -> None: diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index 57720e54d05e..a45b37ab2ad2 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -13,6 +13,7 @@ from openedx.core.djangoapps.content_libraries import api as lib_api from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangoapps.xblock import api as xblock_api +from openedx_learning.api.authoring_models import LearningPackage log = logging.getLogger(__name__) @@ -27,6 +28,7 @@ class Fields: type = "type" # DocType.course_block or DocType.library_block (see below) block_id = "block_id" # The block_id part of the usage key. Sometimes human-readable, sometimes a random hex ID display_name = "display_name" + description = "description" modified = "modified" created = "created" last_published = "last_published" @@ -66,6 +68,7 @@ class DocType: """ course_block = "course_block" library_block = "library_block" + collection = "collection" def meili_id_from_opaque_key(usage_key: UsageKey) -> str: @@ -276,3 +279,38 @@ def searchable_doc_for_course_block(block) -> dict: doc.update(_fields_from_block(block)) return doc + + +def searchable_doc_for_collection(collection) -> dict: + """ + Generate a dictionary document suitable for ingestion into a search engine + like Meilisearch or Elasticsearch, so that the given collection can be + found using faceted search. + """ + doc = { + Fields.id: collection.id, + Fields.type: DocType.collection, + Fields.display_name: collection.title, + Fields.description: collection.description, + Fields.created: collection.created.timestamp(), + Fields.modified: collection.modified.timestamp(), + # Add related learning_package.key as context_key by default. + # If related contentlibrary is found, it will override this value below. + # Mostly contentlibrary.library_key == learning_package.key + Fields.context_key: collection.learning_package.key, + } + # Just in case learning_package is not related to a library + try: + context_key = collection.learning_package.contentlibrary.library_key + org = str(context_key.org) + doc.update({ + Fields.context_key: str(context_key), + Fields.org: org, + }) + except LearningPackage.contentlibrary.RelatedObjectDoesNotExist: + log.warning(f"Related library not found for {collection}") + doc[Fields.access_id] = _meili_access_id_from_context_key(doc[Fields.context_key]) + # Add the breadcrumbs. + doc[Fields.breadcrumbs] = [{"display_name": collection.learning_package.title}] + + return doc diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py index e8616cee60a8..549817700370 100644 --- a/openedx/core/djangoapps/content/search/tests/test_api.py +++ b/openedx/core/djangoapps/content/search/tests/test_api.py @@ -6,12 +6,13 @@ import copy from datetime import datetime, timezone -from unittest.mock import MagicMock, call, patch +from unittest.mock import MagicMock, Mock, call, patch from opaque_keys.edx.keys import UsageKey import ddt from django.test import override_settings from freezegun import freeze_time +from openedx_learning.api import authoring as authoring_api from organizations.tests.factories import OrganizationFactory from common.djangoapps.student.tests.factories import UserFactory @@ -174,6 +175,28 @@ def setUp(self): tagging_api.add_tag_to_taxonomy(self.taxonomyB, "three") tagging_api.add_tag_to_taxonomy(self.taxonomyB, "four") + # Create a collection: + self.learning_package = authoring_api.get_learning_package_by_key(self.library.key) + self.collection_dict = { + 'id': 1, + 'type': 'collection', + 'display_name': 'my_collection', + 'description': 'my collection description', + 'context_key': 'lib:org1:lib', + 'org': 'org1', + 'created': created_date.timestamp(), + 'modified': created_date.timestamp(), + "access_id": lib_access.id, + 'breadcrumbs': [{'display_name': 'Library'}] + } + with freeze_time(created_date): + self.collection = authoring_api.create_collection( + learning_package_id=self.learning_package.id, + title="my_collection", + created_by=None, + description="my collection description" + ) + @override_settings(MEILISEARCH_ENABLED=False) def test_reindex_meilisearch_disabled(self, mock_meilisearch): with self.assertRaises(RuntimeError): @@ -199,10 +222,27 @@ def test_reindex_meilisearch(self, mock_meilisearch): [ call([doc_sequential, doc_vertical]), call([doc_problem1, doc_problem2]), + call([self.collection_dict]), ], any_order=True, ) + @override_settings(MEILISEARCH_ENABLED=True) + @patch( + "openedx.core.djangoapps.content.search.api.searchable_doc_for_collection", + Mock(side_effect=Exception("Failed to generate document")), + ) + def test_reindex_meilisearch_collection_error(self, mock_meilisearch): + + mock_logger = Mock() + api.rebuild_index(mock_logger) + assert call( + [self.collection_dict] + ) not in mock_meilisearch.return_value.index.return_value.add_documents.mock_calls + mock_logger.assert_any_call( + f"Error indexing collection {self.collection}: Failed to generate document" + ) + @override_settings(MEILISEARCH_ENABLED=True) def test_reindex_meilisearch_library_block_error(self, mock_meilisearch): diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py index e853fd425273..6140411705bb 100644 --- a/openedx/core/djangoapps/content/search/tests/test_documents.py +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -1,8 +1,12 @@ """ Tests for the Studio content search documents (what gets stored in the index) """ +from datetime import datetime, timezone from organizations.models import Organization +from freezegun import freeze_time +from openedx_learning.api import authoring as authoring_api + from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangolib.testing.utils import skip_unless_cms from xmodule.modulestore.django import modulestore @@ -11,10 +15,12 @@ try: # This import errors in the lms because content.search is not an installed app there. - from ..documents import searchable_doc_for_course_block, searchable_doc_tags + from ..documents import searchable_doc_for_course_block, searchable_doc_tags, searchable_doc_for_collection from ..models import SearchAccess except RuntimeError: searchable_doc_for_course_block = lambda x: x + searchable_doc_tags = lambda x: x + searchable_doc_for_collection = lambda x: x SearchAccess = {} @@ -198,3 +204,30 @@ def test_video_block_untagged(self): "content": {}, # This video has no tags. } + + def test_collection_with_no_library(self): + created_date = datetime(2023, 4, 5, 6, 7, 8, tzinfo=timezone.utc) + with freeze_time(created_date): + learning_package = authoring_api.create_learning_package( + key="course-v1:edX+toy+2012_Fall", + title="some learning_package", + description="some description", + ) + collection = authoring_api.create_collection( + learning_package_id=learning_package.id, + title="my_collection", + created_by=None, + description="my collection description" + ) + doc = searchable_doc_for_collection(collection) + assert doc == { + "id": collection.id, + "type": "collection", + "display_name": collection.title, + "description": collection.description, + "context_key": learning_package.key, + "access_id": self.toy_course_access_id, + "breadcrumbs": [{"display_name": learning_package.title}], + "created": created_date.timestamp(), + "modified": created_date.timestamp(), + } diff --git a/requirements/constraints.txt b/requirements/constraints.txt index b7c7bebc71d6..8c7e19a18a86 100644 --- a/requirements/constraints.txt +++ b/requirements/constraints.txt @@ -93,7 +93,7 @@ libsass==0.10.0 click==8.1.6 # pinning this version to avoid updates while the library is being developed -openedx-learning==0.11.1 +openedx-learning==0.11.2 # Open AI version 1.0.0 dropped support for openai.ChatCompletion which is currently in use in enterprise. openai<=0.28.1 diff --git a/requirements/edx/base.txt b/requirements/edx/base.txt index 1ed691bf5a51..640ccbe982e7 100644 --- a/requirements/edx/base.txt +++ b/requirements/edx/base.txt @@ -823,7 +823,7 @@ openedx-filters==1.9.0 # -r requirements/edx/kernel.in # lti-consumer-xblock # ora2 -openedx-learning==0.11.1 +openedx-learning==0.11.2 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/kernel.in diff --git a/requirements/edx/development.txt b/requirements/edx/development.txt index 0f88bad4b5b6..9ae45b7ff72c 100644 --- a/requirements/edx/development.txt +++ b/requirements/edx/development.txt @@ -1372,7 +1372,7 @@ openedx-filters==1.9.0 # -r requirements/edx/testing.txt # lti-consumer-xblock # ora2 -openedx-learning==0.11.1 +openedx-learning==0.11.2 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/doc.txt diff --git a/requirements/edx/doc.txt b/requirements/edx/doc.txt index 663e09b3d685..d315955694e7 100644 --- a/requirements/edx/doc.txt +++ b/requirements/edx/doc.txt @@ -982,7 +982,7 @@ openedx-filters==1.9.0 # -r requirements/edx/base.txt # lti-consumer-xblock # ora2 -openedx-learning==0.11.1 +openedx-learning==0.11.2 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/base.txt diff --git a/requirements/edx/testing.txt b/requirements/edx/testing.txt index e8ed020e5128..fcb7db05a5ef 100644 --- a/requirements/edx/testing.txt +++ b/requirements/edx/testing.txt @@ -1033,7 +1033,7 @@ openedx-filters==1.9.0 # -r requirements/edx/base.txt # lti-consumer-xblock # ora2 -openedx-learning==0.11.1 +openedx-learning==0.11.2 # via # -c requirements/edx/../constraints.txt # -r requirements/edx/base.txt