Skip to content

Commit

Permalink
feat: Upload to AI Search in Batches (#1323)
Browse files Browse the repository at this point in the history
  • Loading branch information
cmw2 authored Sep 26, 2024
1 parent c1cb24e commit 5d0a5d6
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 17 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
max-line-length = 88
extend-ignore = E501
exclude = .venv
ignore = E203, W503
18 changes: 12 additions & 6 deletions code/backend/batch/utilities/helpers/embedders/push_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,18 @@ def __embed(
for document in documents:
documents_to_upload.append(self.__convert_to_search_document(document))

response = self.azure_search_helper.get_search_client().upload_documents(
documents_to_upload
)
if not all([r.succeeded for r in response]):
logger.error("Failed to upload documents to search index")
raise Exception(response)
# Upload documents (which are chunks) to search index in batches
if documents_to_upload:
batch_size = self.env_helper.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE
search_client = self.azure_search_helper.get_search_client()
for i in range(0, len(documents_to_upload), batch_size):
batch = documents_to_upload[i : i + batch_size]
response = search_client.upload_documents(batch)
if not all(r.succeeded for r in response if response):
logger.error("Failed to upload documents to search index")
raise RuntimeError(f"Upload failed for some documents: {response}")
else:
logger.warning("No documents to upload.")

def __generate_image_caption(self, source_url):
model = self.env_helper.AZURE_OPENAI_VISION_MODEL
Expand Down
3 changes: 3 additions & 0 deletions code/backend/batch/utilities/helpers/env_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ def __load_config(self, **kwargs) -> None:
self.AZURE_SEARCH_CONVERSATIONS_LOG_INDEX = os.getenv(
"AZURE_SEARCH_CONVERSATIONS_LOG_INDEX", "conversations"
)
self.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = os.getenv(
"AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE", 100
)
# Integrated Vectorization
self.AZURE_SEARCH_DATASOURCE_NAME = os.getenv(
"AZURE_SEARCH_DATASOURCE_NAME", ""
Expand Down
72 changes: 61 additions & 11 deletions code/tests/utilities/helpers/test_push_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG = "default"
AZURE_SEARCH_CONVERSATIONS_LOG_INDEX = "mock-log-index"
USE_ADVANCED_IMAGE_PROCESSING = False
AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = 100


@pytest.fixture(autouse=True)
Expand All @@ -49,7 +50,9 @@ def llm_helper_mock():

@pytest.fixture(autouse=True)
def env_helper_mock():
with patch("backend.batch.utilities.helpers.embedders.push_embedder.EnvHelper") as mock:
with patch(
"backend.batch.utilities.helpers.embedders.push_embedder.EnvHelper"
) as mock:
env_helper = mock.return_value
env_helper.AZURE_AUTH_TYPE = AZURE_AUTH_TYPE
env_helper.AZURE_SEARCH_KEY = AZURE_SEARCH_KEY
Expand All @@ -58,7 +61,9 @@ def env_helper_mock():
env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH = AZURE_SEARCH_USE_SEMANTIC_SEARCH
env_helper.AZURE_SEARCH_FIELDS_ID = AZURE_SEARCH_FIELDS_ID
env_helper.AZURE_SEARCH_CONTENT_COLUMN = AZURE_SEARCH_CONTENT_COLUMN
env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN = AZURE_SEARCH_CONTENT_VECTOR_COLUMN
env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN = (
AZURE_SEARCH_CONTENT_VECTOR_COLUMN
)
env_helper.AZURE_SEARCH_TITLE_COLUMN = AZURE_SEARCH_TITLE_COLUMN
env_helper.AZURE_SEARCH_FIELDS_METADATA = AZURE_SEARCH_FIELDS_METADATA
env_helper.AZURE_SEARCH_SOURCE_COLUMN = AZURE_SEARCH_SOURCE_COLUMN
Expand All @@ -73,6 +78,9 @@ def env_helper_mock():

env_helper.USE_ADVANCED_IMAGE_PROCESSING = USE_ADVANCED_IMAGE_PROCESSING
env_helper.is_auth_type_keys.return_value = True
env_helper.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = (
AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE
)
yield env_helper


Expand Down Expand Up @@ -291,7 +299,10 @@ def test_embed_file_advanced_image_processing_raises_exception_on_failure(


def test_embed_file_use_advanced_image_processing_does_not_vectorize_image_if_unsupported(
azure_computer_vision_mock, mock_config_helper, azure_search_helper_mock, env_helper_mock
azure_computer_vision_mock,
mock_config_helper,
azure_search_helper_mock,
env_helper_mock,
):
# given
mock_config_helper.document_processors = [
Expand Down Expand Up @@ -331,7 +342,9 @@ def test_embed_file_loads_documents(document_loading_mock, env_helper_mock):
)


def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mock, env_helper_mock):
def test_embed_file_chunks_documents(
document_loading_mock, document_chunking_mock, env_helper_mock
):
# given
push_embedder = PushEmbedder(MagicMock(), env_helper_mock)

Expand All @@ -347,7 +360,9 @@ def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mo
)


def test_embed_file_chunks_documents_upper_case(document_loading_mock, document_chunking_mock, env_helper_mock):
def test_embed_file_chunks_documents_upper_case(
document_loading_mock, document_chunking_mock, env_helper_mock
):
# given
push_embedder = PushEmbedder(MagicMock(), env_helper_mock)

Expand All @@ -363,7 +378,9 @@ def test_embed_file_chunks_documents_upper_case(document_loading_mock, document_
)


def test_embed_file_generates_embeddings_for_documents(llm_helper_mock, env_helper_mock):
def test_embed_file_generates_embeddings_for_documents(
llm_helper_mock, env_helper_mock
):
# given
push_embedder = PushEmbedder(MagicMock(), env_helper_mock)

Expand All @@ -382,7 +399,8 @@ def test_embed_file_generates_embeddings_for_documents(llm_helper_mock, env_help
def test_embed_file_stores_documents_in_search_index(
document_chunking_mock,
llm_helper_mock,
azure_search_helper_mock: MagicMock, env_helper_mock
azure_search_helper_mock: MagicMock,
env_helper_mock,
):
# given
push_embedder = PushEmbedder(MagicMock(), env_helper_mock)
Expand All @@ -404,10 +422,14 @@ def test_embed_file_stores_documents_in_search_index(
AZURE_SEARCH_FIELDS_METADATA: json.dumps(
{
AZURE_SEARCH_FIELDS_ID: expected_chunked_documents[0].id,
AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[0].source,
AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[
0
].source,
AZURE_SEARCH_TITLE_COLUMN: expected_chunked_documents[0].title,
AZURE_SEARCH_CHUNK_COLUMN: expected_chunked_documents[0].chunk,
AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[0].offset,
AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[
0
].offset,
"page_number": expected_chunked_documents[0].page_number,
"chunk_id": expected_chunked_documents[0].chunk_id,
}
Expand All @@ -424,10 +446,14 @@ def test_embed_file_stores_documents_in_search_index(
AZURE_SEARCH_FIELDS_METADATA: json.dumps(
{
AZURE_SEARCH_FIELDS_ID: expected_chunked_documents[1].id,
AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[1].source,
AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[
1
].source,
AZURE_SEARCH_TITLE_COLUMN: expected_chunked_documents[1].title,
AZURE_SEARCH_CHUNK_COLUMN: expected_chunked_documents[1].chunk,
AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[1].offset,
AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[
1
].offset,
"page_number": expected_chunked_documents[1].page_number,
"chunk_id": expected_chunked_documents[1].chunk_id,
}
Expand All @@ -441,6 +467,30 @@ def test_embed_file_stores_documents_in_search_index(
)


def test_embed_file_stores_documents_in_search_index_in_batches(
    document_chunking_mock,
    llm_helper_mock,
    azure_search_helper_mock: MagicMock,
    env_helper_mock,
):
    # given: force a batch size of 1 so every chunk triggers its own upload call
    env_helper_mock.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = 1
    push_embedder = PushEmbedder(MagicMock(), env_helper_mock)

    # when
    push_embedder.embed_file(
        "some-url",
        "some-file-name.pdf",
    )

    # then: the mocked chunker yields two documents, so two batched uploads occur
    upload_documents_mock = (
        azure_search_helper_mock.return_value.get_search_client.return_value.upload_documents
    )
    upload_documents_mock.assert_called()
    assert upload_documents_mock.call_count == 2


def test_embed_file_raises_exception_on_failure(
azure_search_helper_mock,
):
Expand Down

0 comments on commit 5d0a5d6

Please sign in to comment.