From 9c457b64104c309524f7a069777c1990da9f1511 Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Fri, 15 Nov 2024 09:56:07 -0300 Subject: [PATCH 1/7] feat: incremental reindex_studio management command --- openedx/core/djangoapps/content/search/api.py | 207 +++++++++++------- .../management/commands/reindex_studio.py | 14 +- .../0002_incrementalindexcompleted.py | 21 ++ .../core/djangoapps/content/search/models.py | 9 + .../content/search/tests/test_api.py | 70 ++++++ 5 files changed, 241 insertions(+), 80 deletions(-) create mode 100644 openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index b1d224b411e4..9d0a2379a0bb 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -5,7 +5,7 @@ import logging import time -from contextlib import contextmanager +from contextlib import contextmanager, nullcontext from datetime import datetime, timedelta, timezone from functools import wraps from typing import Callable, Generator @@ -24,7 +24,7 @@ from rest_framework.request import Request from common.djangoapps.student.role_helpers import get_course_roles from openedx.core.djangoapps.content.course_overviews.models import CourseOverview -from openedx.core.djangoapps.content.search.models import get_access_ids_for_request +from openedx.core.djangoapps.content.search.models import get_access_ids_for_request, IncrementalIndexCompleted from openedx.core.djangoapps.content_libraries import api as lib_api from xmodule.modulestore.django import modulestore @@ -217,6 +217,83 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat _wait_for_meili_task(client.delete_index(temp_index_name)) +def _configure_index(index_name): + """ + Configure the index. The following index settings are best changed on an empty index. + Changing them on a populated index will "re-index all documents in the index", which can take some time. + + Args: + index_name (str): The name of the index to configure + """ + client = _get_meilisearch_client() + + # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): + client.index(index_name).update_distinct_attribute(Fields.usage_key) + # Mark which attributes can be used for filtering/faceted search: + client.index(index_name).update_filterable_attributes([ + # Get specific block/collection using combination of block_id and context_key + Fields.block_id, + Fields.block_type, + Fields.context_key, + Fields.usage_key, + Fields.org, + Fields.tags, + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.type, + Fields.access_id, + Fields.last_published, + Fields.content + "." + Fields.problem_types, + ]) + # Mark which attributes are used for keyword search, in order of importance: + client.index(index_name).update_searchable_attributes([ + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.description, + Fields.tags, + Fields.collections, + # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they + # are searchable only if at least one document in the index has a value. If we didn't list them here and, + # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for + # these sub-fields: "Attribute `tags.level3` is not searchable." + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.published + "." + Fields.display_name, + Fields.published + "." + Fields.published_description, + ]) + # Mark which attributes can be used for sorting search results: + client.index(index_name).update_sortable_attributes([ + Fields.display_name, + Fields.created, + Fields.modified, + Fields.last_published, + ]) + + # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. + # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy + client.index(index_name).update_ranking_rules([ + "sort", + "words", + "typo", + "proximity", + "attribute", + "exactness", + ]) + + def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None: """ Recurse the children of an XBlock and call the given function for each @@ -279,8 +356,31 @@ def is_meilisearch_enabled() -> bool: return False -# pylint: disable=too-many-statements -def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: +def reset_index(status_cb: Callable[[str], None] | None = None) -> None: + if status_cb is None: + status_cb = log.info + + status_cb("Creating new empty index...") + with _using_temp_index(status_cb) as temp_index_name: + _configure_index(temp_index_name) + status_cb("Index recreated!") + status_cb("Index reset complete.") + + +def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None: + if status_cb is None: + status_cb = log.info + if warn_cb is None: + warn_cb = log.warning + + if _index_exists(STUDIO_INDEX_NAME): + warn_cb("A rebuild of the index is required. Please run ./manage.py cms reindex_studio --experimental [--incremental]") + return + + reset_index(status_cb) + + +def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=False) -> None: """ Rebuild the Meilisearch index from scratch """ @@ -292,7 +392,14 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: # Get the lists of libraries status_cb("Counting libraries...") - lib_keys = [lib.library_key for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug')] + keys_indexed = [] + if incremental: + keys_indexed = list(IncrementalIndexCompleted.objects.values_list('context_key', flat=True)) + lib_keys = [ + lib.library_key + for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug').order_by('-id') + if lib.library_key not in keys_indexed + ] num_libraries = len(lib_keys) # Get the list of courses @@ -305,83 +412,19 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None: num_blocks_done = 0 # How many individual components/XBlocks we've indexed status_cb(f"Found {num_courses} courses, {num_libraries} libraries.") - with _using_temp_index(status_cb) as temp_index_name: + with _using_temp_index(status_cb) if not incremental else nullcontext(STUDIO_INDEX_NAME) as index_name: ############## Configure the index ############## - # The following index settings are best changed on an empty index. - # Changing them on a populated index will "re-index all documents in the index, which can take some time" + # The index settings are best changed on an empty index. + # Changing them on a populated index will "re-index all documents in the index", which can take some time # and use more RAM. Instead, we configure an empty index then populate it one course/library at a time. - - # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): - client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) - # Mark which attributes can be used for filtering/faceted search: - client.index(temp_index_name).update_filterable_attributes([ - # Get specific block/collection using combination of block_id and context_key - Fields.block_id, - Fields.block_type, - Fields.context_key, - Fields.usage_key, - Fields.org, - Fields.tags, - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.type, - Fields.access_id, - Fields.last_published, - Fields.content + "." + Fields.problem_types, - ]) - # Mark which attributes are used for keyword search, in order of importance: - client.index(temp_index_name).update_searchable_attributes([ - # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. - Fields.display_name, - Fields.block_id, - Fields.content, - Fields.description, - Fields.tags, - Fields.collections, - # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they - # are searchable only if at least one document in the index has a value. If we didn't list them here and, - # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for - # these sub-fields: "Attribute `tags.level3` is not searchable." - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.published + "." + Fields.display_name, - Fields.published + "." + Fields.published_description, - ]) - # Mark which attributes can be used for sorting search results: - client.index(temp_index_name).update_sortable_attributes([ - Fields.display_name, - Fields.created, - Fields.modified, - Fields.last_published, - ]) - - # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. - # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy - client.index(temp_index_name).update_ranking_rules([ - "sort", - "words", - "typo", - "proximity", - "attribute", - "exactness", - ]) + if not incremental: + _configure_index(index_name) ############## Libraries ############## status_cb("Indexing libraries...") - def index_library(lib_key: str) -> list: + def index_library(lib_key: LibraryLocatorV2) -> list: docs = [] for component in lib_api.get_library_components(lib_key): try: @@ -396,7 +439,7 @@ def index_library(lib_key: str) -> list: if docs: try: # Add all the docs in this library at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) except (TypeError, KeyError, MeilisearchError) as err: status_cb(f"Error indexing library {lib_key}: {err}") return docs @@ -416,7 +459,7 @@ def index_collection_batch(batch, num_done, library_key) -> int: if docs: try: # Add docs in batch of 100 at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) except (TypeError, KeyError, MeilisearchError) as err: status_cb(f"Error indexing collection batch {p}: {err}") return num_done @@ -439,6 +482,8 @@ def index_collection_batch(batch, num_done, library_key) -> int: num_collections_done, lib_key, ) + if incremental: + IncrementalIndexCompleted.objects.get_or_create(context_key=lib_key) status_cb(f"{num_collections_done}/{num_collections} collections indexed for library {lib_key}") num_contexts_done += 1 @@ -464,7 +509,7 @@ def add_with_children(block): if docs: # Add all the docs in this course at once (usually faster than adding one at a time): - _wait_for_meili_task(client.index(temp_index_name).add_documents(docs)) + _wait_for_meili_task(client.index(index_name).add_documents(docs)) return docs paginator = Paginator(CourseOverview.objects.only('id', 'display_name'), 1000) @@ -473,10 +518,16 @@ def add_with_children(block): status_cb( f"{num_contexts_done + 1}/{num_contexts}. Now indexing course {course.display_name} ({course.id})" ) + if course.id in keys_indexed: + num_contexts_done += 1 + continue course_docs = index_course(course) + if incremental: + IncrementalIndexCompleted.objects.get_or_create(context_key=course.id) num_contexts_done += 1 num_blocks_done += len(course_docs) + IncrementalIndexCompleted.objects.all().delete() status_cb(f"Done! {num_blocks_done} blocks indexed across {num_contexts_done} courses, collections and libraries.") diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 3767ebcba6c9..9fa2b30ea87a 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -19,7 +19,10 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('--experimental', action='store_true') - parser.set_defaults(experimental=False) + parser.add_argument('--reset', action='store_true') + parser.add_argument('--init', action='store_true') + parser.add_argument('--incremental', action='store_true') + parser.set_defaults(experimental=False, reset=False, init=False, incremental=False) def handle(self, *args, **options): """ @@ -34,4 +37,11 @@ def handle(self, *args, **options): "Use the --experimental argument to acknowledge and run it." ) - api.rebuild_index(self.stdout.write) + if options["reset"]: + api.reset_index(self.stdout.write) + elif options["init"]: + api.init_index(self.stdout.write) + elif options["incremental"]: + api.rebuild_index(self.stdout.write, incremental=True) + else: + api.rebuild_index(self.stdout.write) diff --git a/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py b/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py new file mode 100644 index 000000000000..a316c35a7dfe --- /dev/null +++ b/openedx/core/djangoapps/content/search/migrations/0002_incrementalindexcompleted.py @@ -0,0 +1,21 @@ +# Generated by Django 4.2.16 on 2024-11-15 12:40 + +from django.db import migrations, models +import opaque_keys.edx.django.models + + +class Migration(migrations.Migration): + + dependencies = [ + ('search', '0001_initial'), + ] + + operations = [ + migrations.CreateModel( + name='IncrementalIndexCompleted', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('context_key', opaque_keys.edx.django.models.LearningContextKeyField(max_length=255, unique=True)), + ], + ), + ] diff --git a/openedx/core/djangoapps/content/search/models.py b/openedx/core/djangoapps/content/search/models.py index 711c493ff895..91e12affc326 100644 --- a/openedx/core/djangoapps/content/search/models.py +++ b/openedx/core/djangoapps/content/search/models.py @@ -65,3 +65,12 @@ def get_access_ids_for_request(request: Request, omit_orgs: list[str] = None) -> course_clause | library_clause ).order_by('-id').values_list("id", flat=True) ) + + +class IncrementalIndexCompleted(models.Model): + """ + Stores the contex keys of aleady indexed courses and libraries for incremental indexing. + """ + context_key = LearningContextKeyField( + max_length=255, unique=True, null=False, + ) diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py index 0aa762fd187f..1c4961ac9838 100644 --- a/openedx/core/djangoapps/content/search/tests/test_api.py +++ b/openedx/core/djangoapps/content/search/tests/test_api.py @@ -10,12 +10,15 @@ from opaque_keys.edx.keys import UsageKey import ddt +import pytest from django.test import override_settings from freezegun import freeze_time +from meilisearch.errors import MeilisearchApiError from openedx_learning.api import authoring as authoring_api from organizations.tests.factories import OrganizationFactory from common.djangoapps.student.tests.factories import UserFactory +from openedx.core.djangoapps.content.search.models import IncrementalIndexCompleted from openedx.core.djangoapps.content_libraries import api as library_api from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangoapps.content.course_overviews.api import CourseOverview @@ -239,6 +242,73 @@ def test_reindex_meilisearch(self, mock_meilisearch): any_order=True, ) + @override_settings(MEILISEARCH_ENABLED=True) + def test_reindex_meilisearch_incremental(self, mock_meilisearch): + + # Add tags field to doc, since reindex calls includes tags + doc_sequential = copy.deepcopy(self.doc_sequential) + doc_sequential["tags"] = {} + doc_vertical = copy.deepcopy(self.doc_vertical) + doc_vertical["tags"] = {} + doc_problem1 = copy.deepcopy(self.doc_problem1) + doc_problem1["tags"] = {} + doc_problem1["collections"] = {'display_name': [], 'key': []} + doc_problem2 = copy.deepcopy(self.doc_problem2) + doc_problem2["tags"] = {} + doc_problem2["collections"] = {'display_name': [], 'key': []} + doc_collection = copy.deepcopy(self.collection_dict) + doc_collection["tags"] = {} + + api.rebuild_index(incremental=True) + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 3 + mock_meilisearch.return_value.index.return_value.add_documents.assert_has_calls( + [ + call([doc_sequential, doc_vertical]), + call([doc_problem1, doc_problem2]), + call([doc_collection]), + ], + any_order=True, + ) + + # Now we simulate interruption by patching _wait_for_meili_task to raise an exception + def simulated_interruption(): + yield + yield + raise Exception("Simulated interruption") + with patch("openedx.core.djangoapps.content.search.api._wait_for_meili_task", side_effect=simulated_interruption()): + with pytest.raises(Exception, match="Simulated interruption"): + api.rebuild_index(incremental=True) + assert IncrementalIndexCompleted.objects.all().count() == 1 + api.rebuild_index(incremental=True) + assert IncrementalIndexCompleted.objects.all().count() == 0 + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 7 + + @override_settings(MEILISEARCH_ENABLED=True) + def test_reset_meilisearch_index(self, mock_meilisearch): + api.reset_index() + mock_meilisearch.return_value.swap_indexes.assert_called_once() + mock_meilisearch.return_value.create_index.assert_called_once() + mock_meilisearch.return_value.delete_index.call_count = 2 + api.reset_index() + mock_meilisearch.return_value.delete_index.call_count = 4 + + @override_settings(MEILISEARCH_ENABLED=True) + def test_init_meilisearch_index(self, mock_meilisearch): + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_not_called() + mock_meilisearch.return_value.create_index.assert_not_called() + mock_meilisearch.return_value.delete_index.assert_not_called() + + mock_meilisearch.return_value.get_index.side_effect = [ + MeilisearchApiError("Testing reindex", Mock(code="index_not_found", text=None)), + MeilisearchApiError("Testing reindex", Mock(code="index_not_found", text=None)), + Mock(), + ] + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_called_once() + mock_meilisearch.return_value.create_index.assert_called_once() + mock_meilisearch.return_value.delete_index.call_count = 2 + @override_settings(MEILISEARCH_ENABLED=True) @patch( "openedx.core.djangoapps.content.search.api.searchable_doc_for_collection", From f54cbb461f3df94deedd8443fd249b32676ec4a0 Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Sat, 16 Nov 2024 01:10:48 -0300 Subject: [PATCH 2/7] fix: tests, linting and formatting --- openedx/core/djangoapps/content/search/api.py | 139 ++++++++++-------- .../management/commands/reindex_studio.py | 8 +- .../core/djangoapps/content/search/models.py | 5 +- .../content/search/tests/test_api.py | 37 +++-- 4 files changed, 107 insertions(+), 82 deletions(-) diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index 9d0a2379a0bb..443bae2b1a65 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -230,68 +230,76 @@ def _configure_index(index_name): # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): client.index(index_name).update_distinct_attribute(Fields.usage_key) # Mark which attributes can be used for filtering/faceted search: - client.index(index_name).update_filterable_attributes([ - # Get specific block/collection using combination of block_id and context_key - Fields.block_id, - Fields.block_type, - Fields.context_key, - Fields.usage_key, - Fields.org, - Fields.tags, - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.type, - Fields.access_id, - Fields.last_published, - Fields.content + "." + Fields.problem_types, - ]) + client.index(index_name).update_filterable_attributes( + [ + # Get specific block/collection using combination of block_id and context_key + Fields.block_id, + Fields.block_type, + Fields.context_key, + Fields.usage_key, + Fields.org, + Fields.tags, + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.type, + Fields.access_id, + Fields.last_published, + Fields.content + "." + Fields.problem_types, + ] + ) # Mark which attributes are used for keyword search, in order of importance: - client.index(index_name).update_searchable_attributes([ - # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. - Fields.display_name, - Fields.block_id, - Fields.content, - Fields.description, - Fields.tags, - Fields.collections, - # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they - # are searchable only if at least one document in the index has a value. If we didn't list them here and, - # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for - # these sub-fields: "Attribute `tags.level3` is not searchable." - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.published + "." + Fields.display_name, - Fields.published + "." + Fields.published_description, - ]) + client.index(index_name).update_searchable_attributes( + [ + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.description, + Fields.tags, + Fields.collections, + # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they + # are searchable only if at least one document in the index has a value. If we didn't list them here and, + # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for + # these sub-fields: "Attribute `tags.level3` is not searchable." + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.published + "." + Fields.display_name, + Fields.published + "." + Fields.published_description, + ] + ) # Mark which attributes can be used for sorting search results: - client.index(index_name).update_sortable_attributes([ - Fields.display_name, - Fields.created, - Fields.modified, - Fields.last_published, - ]) + client.index(index_name).update_sortable_attributes( + [ + Fields.display_name, + Fields.created, + Fields.modified, + Fields.last_published, + ] + ) # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy - client.index(index_name).update_ranking_rules([ - "sort", - "words", - "typo", - "proximity", - "attribute", - "exactness", - ]) + client.index(index_name).update_ranking_rules( + [ + "sort", + "words", + "typo", + "proximity", + "attribute", + "exactness", + ] + ) def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None: @@ -357,6 +365,9 @@ def is_meilisearch_enabled() -> bool: def reset_index(status_cb: Callable[[str], None] | None = None) -> None: + """ + Reset the Meilisearch index, deleting all documents and reconfiguring it + """ if status_cb is None: status_cb = log.info @@ -368,19 +379,25 @@ def reset_index(status_cb: Callable[[str], None] | None = None) -> None: def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None: + """ + Initialize the Meilisearch index, creating it and configuring it if it doesn't exist + """ if status_cb is None: status_cb = log.info if warn_cb is None: warn_cb = log.warning if _index_exists(STUDIO_INDEX_NAME): - warn_cb("A rebuild of the index is required. Please run ./manage.py cms reindex_studio --experimental [--incremental]") + warn_cb( + "A rebuild of the index is required. Please run ./manage.py cms reindex_studio" + " --experimental [--incremental]" + ) return reset_index(status_cb) -def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=False) -> None: +def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=False) -> None: # lint-amnesty, pylint: disable=too-many-statements """ Rebuild the Meilisearch index from scratch """ @@ -394,10 +411,10 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=Fa status_cb("Counting libraries...") keys_indexed = [] if incremental: - keys_indexed = list(IncrementalIndexCompleted.objects.values_list('context_key', flat=True)) + keys_indexed = list(IncrementalIndexCompleted.objects.values_list("context_key", flat=True)) lib_keys = [ lib.library_key - for lib in lib_api.ContentLibrary.objects.select_related('org').only('org', 'slug').order_by('-id') + for lib in lib_api.ContentLibrary.objects.select_related("org").only("org", "slug").order_by("-id") if lib.library_key not in keys_indexed ] num_libraries = len(lib_keys) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 9fa2b30ea87a..e06070927b7d 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -18,10 +18,10 @@ class Command(BaseCommand): """ def add_arguments(self, parser): - parser.add_argument('--experimental', action='store_true') - parser.add_argument('--reset', action='store_true') - parser.add_argument('--init', action='store_true') - parser.add_argument('--incremental', action='store_true') + parser.add_argument("--experimental", action="store_true") + parser.add_argument("--reset", action="store_true") + parser.add_argument("--init", action="store_true") + parser.add_argument("--incremental", action="store_true") parser.set_defaults(experimental=False, reset=False, init=False, incremental=False) def handle(self, *args, **options): diff --git a/openedx/core/djangoapps/content/search/models.py b/openedx/core/djangoapps/content/search/models.py index 91e12affc326..6fa53ef17b34 100644 --- a/openedx/core/djangoapps/content/search/models.py +++ b/openedx/core/djangoapps/content/search/models.py @@ -71,6 +71,9 @@ class IncrementalIndexCompleted(models.Model): """ Stores the contex keys of aleady indexed courses and libraries for incremental indexing. """ + context_key = LearningContextKeyField( - max_length=255, unique=True, null=False, + max_length=255, + unique=True, + null=False, ) diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py index 1c4961ac9838..ac472c510c8b 100644 --- a/openedx/core/djangoapps/content/search/tests/test_api.py +++ b/openedx/core/djangoapps/content/search/tests/test_api.py @@ -18,7 +18,6 @@ from organizations.tests.factories import OrganizationFactory from common.djangoapps.student.tests.factories import UserFactory -from openedx.core.djangoapps.content.search.models import IncrementalIndexCompleted from openedx.core.djangoapps.content_libraries import api as library_api from openedx.core.djangoapps.content_tagging import api as tagging_api from openedx.core.djangoapps.content.course_overviews.api import CourseOverview @@ -29,7 +28,7 @@ try: # This import errors in the lms because content.search is not an installed app there. from .. import api - from ..models import SearchAccess + from ..models import SearchAccess, IncrementalIndexCompleted except RuntimeError: SearchAccess = {} @@ -252,10 +251,10 @@ def test_reindex_meilisearch_incremental(self, mock_meilisearch): doc_vertical["tags"] = {} doc_problem1 = copy.deepcopy(self.doc_problem1) doc_problem1["tags"] = {} - doc_problem1["collections"] = {'display_name': [], 'key': []} + doc_problem1["collections"] = {"display_name": [], "key": []} doc_problem2 = copy.deepcopy(self.doc_problem2) doc_problem2["tags"] = {} - doc_problem2["collections"] = {'display_name': [], 'key': []} + doc_problem2["collections"] = {"display_name": [], "key": []} doc_collection = copy.deepcopy(self.collection_dict) doc_collection["tags"] = {} @@ -270,18 +269,22 @@ def test_reindex_meilisearch_incremental(self, mock_meilisearch): any_order=True, ) - # Now we simulate interruption by patching _wait_for_meili_task to raise an exception - def simulated_interruption(): - yield - yield - raise Exception("Simulated interruption") - with patch("openedx.core.djangoapps.content.search.api._wait_for_meili_task", side_effect=simulated_interruption()): - with pytest.raises(Exception, match="Simulated interruption"): - api.rebuild_index(incremental=True) + # Now we simulate interruption by passing this function to the status_cb argument + def simulated_interruption(message): + # this exception prevents courses from being indexed + if "Indexing courses" in message: + raise Exception("Simulated interruption") + + with pytest.raises(Exception, match="Simulated interruption"): + api.rebuild_index(simulated_interruption, incremental=True) + + # two more calls due to collections + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 5 assert IncrementalIndexCompleted.objects.all().count() == 1 api.rebuild_index(incremental=True) assert IncrementalIndexCompleted.objects.all().count() == 0 - assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 7 + # one missing course indexed + assert mock_meilisearch.return_value.index.return_value.add_documents.call_count == 6 @override_settings(MEILISEARCH_ENABLED=True) def test_reset_meilisearch_index(self, mock_meilisearch): @@ -300,9 +303,11 @@ def test_init_meilisearch_index(self, mock_meilisearch): mock_meilisearch.return_value.delete_index.assert_not_called() mock_meilisearch.return_value.get_index.side_effect = [ - MeilisearchApiError("Testing reindex", Mock(code="index_not_found", text=None)), - MeilisearchApiError("Testing reindex", Mock(code="index_not_found", text=None)), - Mock(), + MeilisearchApiError("Testing reindex", Mock(text='{"code":"index_not_found"}')), + MeilisearchApiError("Testing reindex", Mock(text='{"code":"index_not_found"}')), + Mock(created_at=1), + Mock(created_at=1), + Mock(created_at=1), ] api.init_index() mock_meilisearch.return_value.swap_indexes.assert_called_once() From 0643c9b536a013006ee07367160b4b37c395a282 Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Mon, 18 Nov 2024 20:36:41 -0300 Subject: [PATCH 3/7] fix: improve output of init_index --- .../content/search/management/commands/reindex_studio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index e06070927b7d..2d8bb29f7a1f 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -40,7 +40,7 @@ def handle(self, *args, **options): if options["reset"]: api.reset_index(self.stdout.write) elif options["init"]: - api.init_index(self.stdout.write) + api.init_index(self.stdout.write, self.stderr.write) elif options["incremental"]: api.rebuild_index(self.stdout.write, incremental=True) else: From 077e426f05b893548529e37310570241ba40f77b Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Thu, 28 Nov 2024 01:28:20 -0300 Subject: [PATCH 4/7] fix: address bradens comments --- openedx/core/djangoapps/content/search/api.py | 190 +++++++++++------- .../content/search/tests/test_api.py | 8 + 2 files changed, 126 insertions(+), 72 deletions(-) diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index 443bae2b1a65..297505a21b26 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -62,6 +62,65 @@ EXCLUDED_XBLOCK_TYPES = ['course', 'course_info'] +INDEX_DISTINCT_ATTRIBUTE = "usage_key" +INDEX_FILTRABLE_ATTRIBUTES = [ + # Get specific block/collection using combination of block_id and context_key + Fields.block_id, + Fields.block_type, + Fields.context_key, + Fields.usage_key, + Fields.org, + Fields.tags, + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.type, + Fields.access_id, + Fields.last_published, + Fields.content + "." + Fields.problem_types, +] +INDEX_SEARCHABLE_ATTRIBUTES = [ + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.description, + Fields.tags, + Fields.collections, + # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they + # are searchable only if at least one document in the index has a value. If we didn't list them here and, + # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for + # these sub-fields: "Attribute `tags.level3` is not searchable." + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.published + "." + Fields.display_name, + Fields.published + "." + Fields.published_description, +] +INDEX_SORTABLE_ATTRIBUTES = [ + Fields.display_name, + Fields.created, + Fields.modified, + Fields.last_published, +] +INDEX_RANKING_RULES = [ + "sort", + "words", + "typo", + "proximity", + "attribute", + "exactness", +] + @contextmanager def _index_rebuild_lock() -> Generator[str, None, None]: @@ -217,6 +276,18 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat _wait_for_meili_task(client.delete_index(temp_index_name)) +def _index_is_empty(index_name: str) -> bool: + """ + Check if an index is empty + + Args: + index_name (str): The name of the index to check + """ + client = _get_meilisearch_client() + index = client.get_index(index_name) + return index.get_stats().number_of_documents == 0 + + def _configure_index(index_name): """ Configure the index. The following index settings are best changed on an empty index. @@ -228,78 +299,17 @@ def _configure_index(index_name): client = _get_meilisearch_client() # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): - client.index(index_name).update_distinct_attribute(Fields.usage_key) + client.index(index_name).update_distinct_attribute(INDEX_DISTINCT_ATTRIBUTE) # Mark which attributes can be used for filtering/faceted search: - client.index(index_name).update_filterable_attributes( - [ - # Get specific block/collection using combination of block_id and context_key - Fields.block_id, - Fields.block_type, - Fields.context_key, - Fields.usage_key, - Fields.org, - Fields.tags, - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.type, - Fields.access_id, - Fields.last_published, - Fields.content + "." + Fields.problem_types, - ] - ) + client.index(index_name).update_filterable_attributes(INDEX_FILTRABLE_ATTRIBUTES) # Mark which attributes are used for keyword search, in order of importance: - client.index(index_name).update_searchable_attributes( - [ - # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. - Fields.display_name, - Fields.block_id, - Fields.content, - Fields.description, - Fields.tags, - Fields.collections, - # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they - # are searchable only if at least one document in the index has a value. If we didn't list them here and, - # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for - # these sub-fields: "Attribute `tags.level3` is not searchable." - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.published + "." + Fields.display_name, - Fields.published + "." + Fields.published_description, - ] - ) + client.index(index_name).update_searchable_attributes(INDEX_SEARCHABLE_ATTRIBUTES) # Mark which attributes can be used for sorting search results: - client.index(index_name).update_sortable_attributes( - [ - Fields.display_name, - Fields.created, - Fields.modified, - Fields.last_published, - ] - ) + client.index(index_name).update_sortable_attributes(INDEX_SORTABLE_ATTRIBUTES) # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy - client.index(index_name).update_ranking_rules( - [ - "sort", - "words", - "typo", - "proximity", - "attribute", - "exactness", - ] - ) + client.index(index_name).update_ranking_rules(INDEX_RANKING_RULES) def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None: @@ -378,6 +388,32 @@ def reset_index(status_cb: Callable[[str], None] | None = None) -> None: status_cb("Index reset complete.") +def _is_index_configured(index_name: str) -> bool: + """ + Check if an index is completely configured + + Args: + index_name (str): The name of the index to check + """ + client = _get_meilisearch_client() + index = client.get_index(index_name) + settings = index.get_settings() + for k, v in ( + ("distinctAttribute", INDEX_DISTINCT_ATTRIBUTE), + ("filterableAttributes", INDEX_FILTRABLE_ATTRIBUTES), + ("searchableAttributes", INDEX_SEARCHABLE_ATTRIBUTES), + ("sortableAttributes", INDEX_SORTABLE_ATTRIBUTES), + ("rankingRules", INDEX_RANKING_RULES), + ): + setting = settings.get(k, []) + if isinstance(v, list): + v = set(v) + setting = set(setting) + if setting != v: + return False + return True + + def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None: """ Initialize the Meilisearch index, creating it and configuring it if it doesn't exist @@ -388,10 +424,19 @@ def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable warn_cb = log.warning if _index_exists(STUDIO_INDEX_NAME): - warn_cb( - "A rebuild of the index is required. Please run ./manage.py cms reindex_studio" - " --experimental [--incremental]" - ) + if _index_is_empty(STUDIO_INDEX_NAME): + warn_cb( + "The studio search index is empty. Please run ./manage.py cms reindex_studio" + " --experimental [--incremental]" + ) + return + if not _is_index_configured(STUDIO_INDEX_NAME): + warn_cb( + "A rebuild of the index is required. Please run ./manage.py cms reindex_studio" + " --experimental [--incremental]" + ) + return + status_cb("Index already exists and is configured.") return reset_index(status_cb) @@ -424,8 +469,9 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=Fa num_courses = CourseOverview.objects.count() # Some counters so we can track our progress as indexing progresses: - num_contexts = num_courses + num_libraries - num_contexts_done = 0 # How many courses/libraries we've indexed + num_libs_skipped = len(keys_indexed) + num_contexts = num_courses + num_libraries + num_libs_skipped + num_contexts_done = 0 + num_libs_skipped # How many courses/libraries we've indexed num_blocks_done = 0 # How many individual components/XBlocks we've indexed status_cb(f"Found {num_courses} courses, {num_libraries} libraries.") diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py index ac472c510c8b..c9c2b2589a31 100644 --- a/openedx/core/djangoapps/content/search/tests/test_api.py +++ b/openedx/core/djangoapps/content/search/tests/test_api.py @@ -297,6 +297,14 @@ def test_reset_meilisearch_index(self, mock_meilisearch): @override_settings(MEILISEARCH_ENABLED=True) def test_init_meilisearch_index(self, mock_meilisearch): + # Test index already exists + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_not_called() + mock_meilisearch.return_value.create_index.assert_not_called() + mock_meilisearch.return_value.delete_index.assert_not_called() + + # Test index already exists and has no documents + mock_meilisearch.return_value.get_stats.return_value = 0 api.init_index() mock_meilisearch.return_value.swap_indexes.assert_not_called() mock_meilisearch.return_value.create_index.assert_not_called() From 4a0e89c7282e6a0f568823421c01cab3324a73ff Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Thu, 28 Nov 2024 09:37:59 -0300 Subject: [PATCH 5/7] fix: settings name overshadow --- openedx/core/djangoapps/content/search/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index 297505a21b26..688223ece13b 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -397,7 +397,7 @@ def _is_index_configured(index_name: str) -> bool: """ client = _get_meilisearch_client() index = client.get_index(index_name) - settings = index.get_settings() + index_settings = index.get_settings() for k, v in ( ("distinctAttribute", INDEX_DISTINCT_ATTRIBUTE), ("filterableAttributes", INDEX_FILTRABLE_ATTRIBUTES), @@ -405,7 +405,7 @@ def _is_index_configured(index_name: str) -> bool: ("sortableAttributes", INDEX_SORTABLE_ATTRIBUTES), ("rankingRules", INDEX_RANKING_RULES), ): - setting = settings.get(k, []) + setting = index_settings.get(k, []) if isinstance(v, list): v = set(v) setting = set(setting) From 85d280a4a8cfef6bfc0cc908b0435a25c3a70602 Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Mon, 2 Dec 2024 21:14:24 -0300 Subject: [PATCH 6/7] refactor: extract constants to index config module --- openedx/core/djangoapps/content/search/api.py | 70 +++---------------- .../djangoapps/content/search/index_config.py | 69 ++++++++++++++++++ 2 files changed, 78 insertions(+), 61 deletions(-) create mode 100644 openedx/core/djangoapps/content/search/index_config.py diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index 688223ece13b..a18c55bd3d22 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -25,6 +25,13 @@ from common.djangoapps.student.role_helpers import get_course_roles from openedx.core.djangoapps.content.course_overviews.models import CourseOverview from openedx.core.djangoapps.content.search.models import get_access_ids_for_request, IncrementalIndexCompleted +from openedx.core.djangoapps.content.search.index_config import ( + INDEX_DISTINCT_ATTRIBUTE, + INDEX_FILTERABLE_ATTRIBUTES, + INDEX_SEARCHABLE_ATTRIBUTES, + INDEX_SORTABLE_ATTRIBUTES, + INDEX_RANKING_RULES, +) from openedx.core.djangoapps.content_libraries import api as lib_api from xmodule.modulestore.django import modulestore @@ -62,65 +69,6 @@ EXCLUDED_XBLOCK_TYPES = ['course', 'course_info'] -INDEX_DISTINCT_ATTRIBUTE = "usage_key" -INDEX_FILTRABLE_ATTRIBUTES = [ - # Get specific block/collection using combination of block_id and context_key - Fields.block_id, - Fields.block_type, - Fields.context_key, - Fields.usage_key, - Fields.org, - Fields.tags, - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.type, - Fields.access_id, - Fields.last_published, - Fields.content + "." + Fields.problem_types, -] -INDEX_SEARCHABLE_ATTRIBUTES = [ - # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. - Fields.display_name, - Fields.block_id, - Fields.content, - Fields.description, - Fields.tags, - Fields.collections, - # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they - # are searchable only if at least one document in the index has a value. If we didn't list them here and, - # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for - # these sub-fields: "Attribute `tags.level3` is not searchable." - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.published + "." + Fields.display_name, - Fields.published + "." + Fields.published_description, -] -INDEX_SORTABLE_ATTRIBUTES = [ - Fields.display_name, - Fields.created, - Fields.modified, - Fields.last_published, -] -INDEX_RANKING_RULES = [ - "sort", - "words", - "typo", - "proximity", - "attribute", - "exactness", -] - @contextmanager def _index_rebuild_lock() -> Generator[str, None, None]: @@ -301,7 +249,7 @@ def _configure_index(index_name): # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): client.index(index_name).update_distinct_attribute(INDEX_DISTINCT_ATTRIBUTE) # Mark which attributes can be used for filtering/faceted search: - client.index(index_name).update_filterable_attributes(INDEX_FILTRABLE_ATTRIBUTES) + client.index(index_name).update_filterable_attributes(INDEX_FILTERABLE_ATTRIBUTES) # Mark which attributes are used for keyword search, in order of importance: client.index(index_name).update_searchable_attributes(INDEX_SEARCHABLE_ATTRIBUTES) # Mark which attributes can be used for sorting search results: @@ -400,7 +348,7 @@ def _is_index_configured(index_name: str) -> bool: index_settings = index.get_settings() for k, v in ( ("distinctAttribute", INDEX_DISTINCT_ATTRIBUTE), - ("filterableAttributes", INDEX_FILTRABLE_ATTRIBUTES), + ("filterableAttributes", INDEX_FILTERABLE_ATTRIBUTES), ("searchableAttributes", INDEX_SEARCHABLE_ATTRIBUTES), ("sortableAttributes", INDEX_SORTABLE_ATTRIBUTES), ("rankingRules", INDEX_RANKING_RULES), diff --git a/openedx/core/djangoapps/content/search/index_config.py b/openedx/core/djangoapps/content/search/index_config.py new file mode 100644 index 000000000000..e60db0e28dc7 --- /dev/null +++ b/openedx/core/djangoapps/content/search/index_config.py @@ -0,0 +1,69 @@ +from .documents import Fields + + +INDEX_DISTINCT_ATTRIBUTE = "usage_key" + +# Mark which attributes can be used for filtering/faceted search: +INDEX_FILTERABLE_ATTRIBUTES = [ + # Get specific block/collection using combination of block_id and context_key + Fields.block_id, + Fields.block_type, + Fields.context_key, + Fields.usage_key, + Fields.org, + Fields.tags, + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.type, + Fields.access_id, + Fields.last_published, + Fields.content + "." + Fields.problem_types, +] + +# Mark which attributes are used for keyword search, in order of importance: +INDEX_SEARCHABLE_ATTRIBUTES = [ + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.description, + Fields.tags, + Fields.collections, + # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they + # are searchable only if at least one document in the index has a value. If we didn't list them here and, + # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for + # these sub-fields: "Attribute `tags.level3` is not searchable." + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.published + "." + Fields.display_name, + Fields.published + "." + Fields.published_description, +] + +# Mark which attributes can be used for sorting search results: +INDEX_SORTABLE_ATTRIBUTES = [ + Fields.display_name, + Fields.created, + Fields.modified, + Fields.last_published, +] + +# Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. +INDEX_RANKING_RULES = [ + "sort", + "words", + "typo", + "proximity", + "attribute", + "exactness", +] From c41c44d441aaf8b7521c9e3c7dadd4a4057e0af2 Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Mon, 2 Dec 2024 22:16:56 -0300 Subject: [PATCH 7/7] fix: add index config docstring --- openedx/core/djangoapps/content/search/index_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openedx/core/djangoapps/content/search/index_config.py b/openedx/core/djangoapps/content/search/index_config.py index e60db0e28dc7..9570956e425e 100644 --- a/openedx/core/djangoapps/content/search/index_config.py +++ b/openedx/core/djangoapps/content/search/index_config.py @@ -1,3 +1,4 @@ +"""Configuration for the search index.""" from .documents import Fields