From 73957b6ea5325b30c7119706709906916e913aed Mon Sep 17 00:00:00 2001 From: Daniel Valenzuela Date: Thu, 28 Nov 2024 01:28:20 -0300 Subject: [PATCH] fix: address bradens comments --- openedx/core/djangoapps/content/search/api.py | 190 +++++++++++------- .../content/search/tests/test_api.py | 8 + 2 files changed, 126 insertions(+), 72 deletions(-) diff --git a/openedx/core/djangoapps/content/search/api.py b/openedx/core/djangoapps/content/search/api.py index 443bae2b1a65..297505a21b26 100644 --- a/openedx/core/djangoapps/content/search/api.py +++ b/openedx/core/djangoapps/content/search/api.py @@ -62,6 +62,65 @@ EXCLUDED_XBLOCK_TYPES = ['course', 'course_info'] +INDEX_DISTINCT_ATTRIBUTE = "usage_key" +INDEX_FILTRABLE_ATTRIBUTES = [ + # Get specific block/collection using combination of block_id and context_key + Fields.block_id, + Fields.block_type, + Fields.context_key, + Fields.usage_key, + Fields.org, + Fields.tags, + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.type, + Fields.access_id, + Fields.last_published, + Fields.content + "." + Fields.problem_types, +] +INDEX_SEARCHABLE_ATTRIBUTES = [ + # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. + Fields.display_name, + Fields.block_id, + Fields.content, + Fields.description, + Fields.tags, + Fields.collections, + # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they + # are searchable only if at least one document in the index has a value. If we didn't list them here and, + # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for + # these sub-fields: "Attribute `tags.level3` is not searchable." + Fields.tags + "." + Fields.tags_taxonomy, + Fields.tags + "." + Fields.tags_level0, + Fields.tags + "." + Fields.tags_level1, + Fields.tags + "." + Fields.tags_level2, + Fields.tags + "." + Fields.tags_level3, + Fields.collections + "." + Fields.collections_display_name, + Fields.collections + "." + Fields.collections_key, + Fields.published + "." + Fields.display_name, + Fields.published + "." + Fields.published_description, +] +INDEX_SORTABLE_ATTRIBUTES = [ + Fields.display_name, + Fields.created, + Fields.modified, + Fields.last_published, +] +INDEX_RANKING_RULES = [ + "sort", + "words", + "typo", + "proximity", + "attribute", + "exactness", +] + @contextmanager def _index_rebuild_lock() -> Generator[str, None, None]: @@ -217,6 +276,18 @@ def _using_temp_index(status_cb: Callable[[str], None] | None = None) -> Generat _wait_for_meili_task(client.delete_index(temp_index_name)) +def _index_is_empty(index_name: str) -> bool: + """ + Check if an index is empty + + Args: + index_name (str): The name of the index to check + """ + client = _get_meilisearch_client() + index = client.get_index(index_name) + return index.get_stats().number_of_documents == 0 + + def _configure_index(index_name): """ Configure the index. The following index settings are best changed on an empty index. @@ -228,78 +299,17 @@ def _configure_index(index_name): client = _get_meilisearch_client() # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): - client.index(index_name).update_distinct_attribute(Fields.usage_key) + client.index(index_name).update_distinct_attribute(INDEX_DISTINCT_ATTRIBUTE) # Mark which attributes can be used for filtering/faceted search: - client.index(index_name).update_filterable_attributes( - [ - # Get specific block/collection using combination of block_id and context_key - Fields.block_id, - Fields.block_type, - Fields.context_key, - Fields.usage_key, - Fields.org, - Fields.tags, - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.type, - Fields.access_id, - Fields.last_published, - Fields.content + "." + Fields.problem_types, - ] - ) + client.index(index_name).update_filterable_attributes(INDEX_FILTRABLE_ATTRIBUTES) # Mark which attributes are used for keyword search, in order of importance: - client.index(index_name).update_searchable_attributes( - [ - # Keyword search does _not_ search the course name, course ID, breadcrumbs, block type, or other fields. - Fields.display_name, - Fields.block_id, - Fields.content, - Fields.description, - Fields.tags, - Fields.collections, - # If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they - # are searchable only if at least one document in the index has a value. If we didn't list them here and, - # say, there were no tags.level3 tags in the index, the client would get an error if trying to search for - # these sub-fields: "Attribute `tags.level3` is not searchable." - Fields.tags + "." + Fields.tags_taxonomy, - Fields.tags + "." + Fields.tags_level0, - Fields.tags + "." + Fields.tags_level1, - Fields.tags + "." + Fields.tags_level2, - Fields.tags + "." + Fields.tags_level3, - Fields.collections + "." + Fields.collections_display_name, - Fields.collections + "." + Fields.collections_key, - Fields.published + "." + Fields.display_name, - Fields.published + "." + Fields.published_description, - ] - ) + client.index(index_name).update_searchable_attributes(INDEX_SEARCHABLE_ATTRIBUTES) # Mark which attributes can be used for sorting search results: - client.index(index_name).update_sortable_attributes( - [ - Fields.display_name, - Fields.created, - Fields.modified, - Fields.last_published, - ] - ) + client.index(index_name).update_sortable_attributes(INDEX_SORTABLE_ATTRIBUTES) # Update the search ranking rules to let the (optional) "sort" parameter take precedence over keyword relevance. # cf https://www.meilisearch.com/docs/learn/core_concepts/relevancy - client.index(index_name).update_ranking_rules( - [ - "sort", - "words", - "typo", - "proximity", - "attribute", - "exactness", - ] - ) + client.index(index_name).update_ranking_rules(INDEX_RANKING_RULES) def _recurse_children(block, fn, status_cb: Callable[[str], None] | None = None) -> None: @@ -378,6 +388,32 @@ def reset_index(status_cb: Callable[[str], None] | None = None) -> None: status_cb("Index reset complete.") +def _is_index_configured(index_name: str) -> bool: + """ + Check if an index is completely configured + + Args: + index_name (str): The name of the index to check + """ + client = _get_meilisearch_client() + index = client.get_index(index_name) + settings = index.get_settings() + for k, v in ( + ("distinctAttribute", INDEX_DISTINCT_ATTRIBUTE), + ("filterableAttributes", INDEX_FILTRABLE_ATTRIBUTES), + ("searchableAttributes", INDEX_SEARCHABLE_ATTRIBUTES), + ("sortableAttributes", INDEX_SORTABLE_ATTRIBUTES), + ("rankingRules", INDEX_RANKING_RULES), + ): + setting = settings.get(k, []) + if isinstance(v, list): + v = set(v) + setting = set(setting) + if setting != v: + return False + return True + + def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable[[str], None] | None = None) -> None: """ Initialize the Meilisearch index, creating it and configuring it if it doesn't exist @@ -388,10 +424,19 @@ def init_index(status_cb: Callable[[str], None] | None = None, warn_cb: Callable warn_cb = log.warning if _index_exists(STUDIO_INDEX_NAME): - warn_cb( - "A rebuild of the index is required. Please run ./manage.py cms reindex_studio" - " --experimental [--incremental]" - ) + if _index_is_empty(STUDIO_INDEX_NAME): + warn_cb( + "The studio search index is empty. Please run ./manage.py cms reindex_studio" + " --experimental [--incremental]" + ) + return + if not _is_index_configured(STUDIO_INDEX_NAME): + warn_cb( + "A rebuild of the index is required. Please run ./manage.py cms reindex_studio" + " --experimental [--incremental]" + ) + return + status_cb("Index already exists and is configured.") return reset_index(status_cb) @@ -424,8 +469,9 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None, incremental=Fa num_courses = CourseOverview.objects.count() # Some counters so we can track our progress as indexing progresses: - num_contexts = num_courses + num_libraries - num_contexts_done = 0 # How many courses/libraries we've indexed + num_libs_skipped = len(keys_indexed) + num_contexts = num_courses + num_libraries + num_libs_skipped + num_contexts_done = 0 + num_libs_skipped # How many courses/libraries we've indexed num_blocks_done = 0 # How many individual components/XBlocks we've indexed status_cb(f"Found {num_courses} courses, {num_libraries} libraries.") diff --git a/openedx/core/djangoapps/content/search/tests/test_api.py b/openedx/core/djangoapps/content/search/tests/test_api.py index 6c46afb22b44..51b685e456b4 100644 --- a/openedx/core/djangoapps/content/search/tests/test_api.py +++ b/openedx/core/djangoapps/content/search/tests/test_api.py @@ -294,6 +294,14 @@ def test_reset_meilisearch_index(self, mock_meilisearch): @override_settings(MEILISEARCH_ENABLED=True) def test_init_meilisearch_index(self, mock_meilisearch): + # Test index already exists + api.init_index() + mock_meilisearch.return_value.swap_indexes.assert_not_called() + mock_meilisearch.return_value.create_index.assert_not_called() + mock_meilisearch.return_value.delete_index.assert_not_called() + + # Test index already exists and has no documents + mock_meilisearch.return_value.get_stats.return_value = 0 api.init_index() mock_meilisearch.return_value.swap_indexes.assert_not_called() mock_meilisearch.return_value.create_index.assert_not_called()