From 5d0a5d673d8ba26e5079ada76c98fbf54c106143 Mon Sep 17 00:00:00 2001
From: Chris Westbrook
Date: Thu, 26 Sep 2024 06:45:13 -0400
Subject: [PATCH] feat: Upload to AI Search in Batches (#1323)

---
 .flake8                                      |  1 +
 .../helpers/embedders/push_embedder.py       | 18 +++--
 .../batch/utilities/helpers/env_helper.py    |  3 +
 .../utilities/helpers/test_push_embedder.py  | 72 ++++++++++++++++---
 4 files changed, 77 insertions(+), 17 deletions(-)

diff --git a/.flake8 b/.flake8
index bdf2be566..1619f6901 100644
--- a/.flake8
+++ b/.flake8
@@ -2,3 +2,4 @@
 max-line-length = 88
 extend-ignore = E501
 exclude = .venv
+ignore = E203, W503
diff --git a/code/backend/batch/utilities/helpers/embedders/push_embedder.py b/code/backend/batch/utilities/helpers/embedders/push_embedder.py
index 2cec6520b..a1cff59cc 100644
--- a/code/backend/batch/utilities/helpers/embedders/push_embedder.py
+++ b/code/backend/batch/utilities/helpers/embedders/push_embedder.py
@@ -79,12 +79,18 @@ def __embed(
         for document in documents:
             documents_to_upload.append(self.__convert_to_search_document(document))

-        response = self.azure_search_helper.get_search_client().upload_documents(
-            documents_to_upload
-        )
-        if not all([r.succeeded for r in response]):
-            logger.error("Failed to upload documents to search index")
-            raise Exception(response)
+        # Upload documents (which are chunks) to search index in batches
+        if documents_to_upload:
+            batch_size = self.env_helper.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE
+            search_client = self.azure_search_helper.get_search_client()
+            for i in range(0, len(documents_to_upload), batch_size):
+                batch = documents_to_upload[i : i + batch_size]
+                response = search_client.upload_documents(batch)
+                if not all(r.succeeded for r in response):
+                    logger.error("Failed to upload documents to search index")
+                    raise RuntimeError(f"Upload failed for some documents: {response}")
+        else:
+            logger.warning("No documents to upload.")

     def __generate_image_caption(self, source_url):
         model = self.env_helper.AZURE_OPENAI_VISION_MODEL
diff --git a/code/backend/batch/utilities/helpers/env_helper.py b/code/backend/batch/utilities/helpers/env_helper.py
index 873bce6ec..9c6974c46 100644
--- a/code/backend/batch/utilities/helpers/env_helper.py
+++ b/code/backend/batch/utilities/helpers/env_helper.py
@@ -78,6 +78,9 @@ def __load_config(self, **kwargs) -> None:
         self.AZURE_SEARCH_CONVERSATIONS_LOG_INDEX = os.getenv(
             "AZURE_SEARCH_CONVERSATIONS_LOG_INDEX", "conversations"
         )
+        self.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = os.getenv(
+            "AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE", 100
+        )
         # Integrated Vectorization
         self.AZURE_SEARCH_DATASOURCE_NAME = os.getenv(
             "AZURE_SEARCH_DATASOURCE_NAME", ""
diff --git a/code/tests/utilities/helpers/test_push_embedder.py b/code/tests/utilities/helpers/test_push_embedder.py
index c1031a49c..cc6702bb8 100644
--- a/code/tests/utilities/helpers/test_push_embedder.py
+++ b/code/tests/utilities/helpers/test_push_embedder.py
@@ -27,6 +27,7 @@
 AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG = "default"
 AZURE_SEARCH_CONVERSATIONS_LOG_INDEX = "mock-log-index"
 USE_ADVANCED_IMAGE_PROCESSING = False
+AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = 100


 @pytest.fixture(autouse=True)
@@ -49,7 +50,9 @@ def llm_helper_mock():

 @pytest.fixture(autouse=True)
 def env_helper_mock():
-    with patch("backend.batch.utilities.helpers.embedders.push_embedder.EnvHelper") as mock:
+    with patch(
+        "backend.batch.utilities.helpers.embedders.push_embedder.EnvHelper"
+    ) as mock:
         env_helper = mock.return_value
         env_helper.AZURE_AUTH_TYPE = AZURE_AUTH_TYPE
         env_helper.AZURE_SEARCH_KEY = AZURE_SEARCH_KEY
@@ -58,7 +61,9 @@ def env_helper_mock():
         env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH = AZURE_SEARCH_USE_SEMANTIC_SEARCH
         env_helper.AZURE_SEARCH_FIELDS_ID = AZURE_SEARCH_FIELDS_ID
         env_helper.AZURE_SEARCH_CONTENT_COLUMN = AZURE_SEARCH_CONTENT_COLUMN
-        env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN = AZURE_SEARCH_CONTENT_VECTOR_COLUMN
+        env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN = (
+            AZURE_SEARCH_CONTENT_VECTOR_COLUMN
+        )
         env_helper.AZURE_SEARCH_TITLE_COLUMN = AZURE_SEARCH_TITLE_COLUMN
         env_helper.AZURE_SEARCH_FIELDS_METADATA = AZURE_SEARCH_FIELDS_METADATA
         env_helper.AZURE_SEARCH_SOURCE_COLUMN = AZURE_SEARCH_SOURCE_COLUMN
@@ -73,6 +78,9 @@ def env_helper_mock():

         env_helper.USE_ADVANCED_IMAGE_PROCESSING = USE_ADVANCED_IMAGE_PROCESSING
         env_helper.is_auth_type_keys.return_value = True
+        env_helper.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = (
+            AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE
+        )

         yield env_helper

@@ -291,7 +299,10 @@ def test_embed_file_advanced_image_processing_raises_exception_on_failure(


 def test_embed_file_use_advanced_image_processing_does_not_vectorize_image_if_unsupported(
-    azure_computer_vision_mock, mock_config_helper, azure_search_helper_mock, env_helper_mock
+    azure_computer_vision_mock,
+    mock_config_helper,
+    azure_search_helper_mock,
+    env_helper_mock,
 ):
     # given
     mock_config_helper.document_processors = [
@@ -331,7 +342,9 @@ def test_embed_file_loads_documents(document_loading_mock, env_helper_mock):
     )


-def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mock, env_helper_mock):
+def test_embed_file_chunks_documents(
+    document_loading_mock, document_chunking_mock, env_helper_mock
+):
     # given
     push_embedder = PushEmbedder(MagicMock(), env_helper_mock)

@@ -347,7 +360,9 @@ def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mo
     )


-def test_embed_file_chunks_documents_upper_case(document_loading_mock, document_chunking_mock, env_helper_mock):
+def test_embed_file_chunks_documents_upper_case(
+    document_loading_mock, document_chunking_mock, env_helper_mock
+):
     # given
     push_embedder = PushEmbedder(MagicMock(), env_helper_mock)

@@ -363,7 +378,9 @@ def test_embed_file_chunks_documents_upper_case(document_loading_mock, document_
     )


-def test_embed_file_generates_embeddings_for_documents(llm_helper_mock, env_helper_mock):
+def test_embed_file_generates_embeddings_for_documents(
+    llm_helper_mock, env_helper_mock
+):
     # given
     push_embedder = PushEmbedder(MagicMock(), env_helper_mock)

@@ -382,7 +399,8 @@ def test_embed_file_generates_embeddings_for_documents(llm_helper_mock, env_help
 def test_embed_file_stores_documents_in_search_index(
     document_chunking_mock,
     llm_helper_mock,
-    azure_search_helper_mock: MagicMock, env_helper_mock
+    azure_search_helper_mock: MagicMock,
+    env_helper_mock,
 ):
     # given
     push_embedder = PushEmbedder(MagicMock(), env_helper_mock)
@@ -404,10 +422,14 @@ def test_embed_file_stores_documents_in_search_index(
                 AZURE_SEARCH_FIELDS_METADATA: json.dumps(
                     {
                         AZURE_SEARCH_FIELDS_ID: expected_chunked_documents[0].id,
-                        AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[0].source,
+                        AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[
+                            0
+                        ].source,
                         AZURE_SEARCH_TITLE_COLUMN: expected_chunked_documents[0].title,
                         AZURE_SEARCH_CHUNK_COLUMN: expected_chunked_documents[0].chunk,
-                        AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[0].offset,
+                        AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[
+                            0
+                        ].offset,
                         "page_number": expected_chunked_documents[0].page_number,
"chunk_id": expected_chunked_documents[0].chunk_id, } @@ -424,10 +446,14 @@ def test_embed_file_stores_documents_in_search_index( AZURE_SEARCH_FIELDS_METADATA: json.dumps( { AZURE_SEARCH_FIELDS_ID: expected_chunked_documents[1].id, - AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[1].source, + AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[ + 1 + ].source, AZURE_SEARCH_TITLE_COLUMN: expected_chunked_documents[1].title, AZURE_SEARCH_CHUNK_COLUMN: expected_chunked_documents[1].chunk, - AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[1].offset, + AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[ + 1 + ].offset, "page_number": expected_chunked_documents[1].page_number, "chunk_id": expected_chunked_documents[1].chunk_id, } @@ -441,6 +467,30 @@ def test_embed_file_stores_documents_in_search_index( ) +def test_embed_file_stores_documents_in_search_index_in_batches( + document_chunking_mock, + llm_helper_mock, + azure_search_helper_mock: MagicMock, + env_helper_mock, +): + # given + env_helper_mock.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = 1 + push_embedder = PushEmbedder(MagicMock(), env_helper_mock) + + # when + push_embedder.embed_file( + "some-url", + "some-file-name.pdf", + ) + + # then + azure_search_helper_mock.return_value.get_search_client.return_value.upload_documents.assert_called() + assert ( + azure_search_helper_mock.return_value.get_search_client.return_value.upload_documents.call_count + == 2 + ) + + def test_embed_file_raises_exception_on_failure( azure_search_helper_mock, ):