From cfc075a1a90d307118e0bfa5a0348fc052f678b9 Mon Sep 17 00:00:00 2001 From: SdgJlbl Date: Thu, 27 Apr 2023 12:06:42 +0200 Subject: [PATCH 1/6] fix: make BuildError serialisable by Celery in case of retry Signed-off-by: SdgJlbl --- backend/substrapp/compute_tasks/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/substrapp/compute_tasks/errors.py b/backend/substrapp/compute_tasks/errors.py index ae5235b6a..49b7c737a 100644 --- a/backend/substrapp/compute_tasks/errors.py +++ b/backend/substrapp/compute_tasks/errors.py @@ -72,7 +72,7 @@ class BuildError(_ComputeTaskError, CeleryRetryError): def __init__(self, logs: str, *args, **kwargs): self.logs = BytesIO(str.encode(logs)) - super().__init__(*args, **kwargs) + super().__init__(logs, *args, **kwargs) class ExecutionError(_ComputeTaskError, CeleryNoRetryError): From 05ed176e0ff036ae05a7ae7cc0b79ee9a627fd25 Mon Sep 17 00:00:00 2001 From: SdgJlbl Date: Thu, 27 Apr 2023 12:36:45 +0200 Subject: [PATCH 2/6] fix: do not retry non-time-out Build errors Signed-off-by: SdgJlbl --- backend/substrapp/compute_tasks/errors.py | 16 ++++++- .../substrapp/compute_tasks/image_builder.py | 44 +++++++++++-------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/backend/substrapp/compute_tasks/errors.py b/backend/substrapp/compute_tasks/errors.py index 49b7c737a..a3dea56e6 100644 --- a/backend/substrapp/compute_tasks/errors.py +++ b/backend/substrapp/compute_tasks/errors.py @@ -61,7 +61,21 @@ class _ComputeTaskError(RuntimeError): error_type: ComputeTaskErrorType -class BuildError(_ComputeTaskError, CeleryRetryError): +class BuildRetryError(_ComputeTaskError, CeleryRetryError): + """An error occurred during the build of a container image. + + Args: + logs (str): the container image build logs + """ + + error_type = ComputeTaskErrorType.BUILD_ERROR + + def __init__(self, logs: str, *args, **kwargs): + self.logs = BytesIO(str.encode(logs)) + super().__init__(logs, *args, **kwargs) + + +class BuildError(_ComputeTaskError, CeleryNoRetryError): """An error occurred during the build of a container image. Args: diff --git a/backend/substrapp/compute_tasks/image_builder.py b/backend/substrapp/compute_tasks/image_builder.py index 262d0c974..3df3a648d 100644 --- a/backend/substrapp/compute_tasks/image_builder.py +++ b/backend/substrapp/compute_tasks/image_builder.py @@ -7,6 +7,7 @@ from django.conf import settings import orchestrator +from substrapp import exceptions from substrapp.compute_tasks import errors as compute_task_errors from substrapp.compute_tasks import utils from substrapp.compute_tasks.compute_pod import Label @@ -114,6 +115,16 @@ def _get_entrypoint_from_dockerfile(dockerfile_dir: str) -> list[str]: raise compute_task_errors.BuildError("Invalid Dockerfile: Cannot find ENTRYPOINT") +def _delete_kaniko_pod(create_pod: bool, k8s_client: kubernetes.client.CoreV1Api, pod_name: str) -> str: + logs = "" + if create_pod: + logs = get_pod_logs(k8s_client, pod_name, KANIKO_CONTAINER_NAME, ignore_pod_not_found=True) + delete_pod(k8s_client, pod_name) + for line in (logs or "").split("\n"): + logger.info(line, pod_name=pod_name) + return logs + + @timeit def _build_container_image(path: str, tag: str) -> None: _assert_dockerfile_exist(path) @@ -130,35 +141,30 @@ def _build_container_image(path: str, tag: str) -> None: pod = _build_pod(path, tag) k8s_client.create_namespaced_pod(body=pod, namespace=NAMESPACE) except kubernetes.client.ApiException as e: - raise compute_task_errors.BuildError( + raise compute_task_errors.BuildRetryError( f"Error creating pod {NAMESPACE}/{pod_name}. Reason: {e.reason}, status: {e.status}, body: {e.body}" ) from e - build_exc = None - try: watch_pod(k8s_client, pod_name) except Exception as e: # In case of concurrent builds, it may fail. Check if the image exists. if container_image_exists(tag): + logger.warning( + f"Build of container image {tag} failed, probably because it was done by a concurrent build", + exc_info=True, + ) return - build_exc = e - - finally: - logs = None - - if create_pod: - logs = get_pod_logs(k8s_client, pod_name, KANIKO_CONTAINER_NAME, ignore_pod_not_found=True) - delete_pod(k8s_client, pod_name) - for line in (logs or "").split("\n"): - logger.info(line, pod_name=pod_name) - - if build_exc: - err_msg = str(build_exc) - if logs: - err_msg += "\n\n" + logs - raise compute_task_errors.BuildError(err_msg) + + logs = _delete_kaniko_pod(create_pod, k8s_client, pod_name) + + if isinstance(e, exceptions.PodTimeoutError): + raise compute_task_errors.BuildRetryError(logs) from e + else: # exceptions.PodError or other + raise compute_task_errors.BuildError(logs) from e + + _delete_kaniko_pod(create_pod, k8s_client, pod_name) def _assert_dockerfile_exist(dockerfile_path): From b957e84ed2d06758d3013a513c6aa7799fc623a4 Mon Sep 17 00:00:00 2001 From: SdgJlbl Date: Thu, 27 Apr 2023 16:37:47 +0200 Subject: [PATCH 3/6] fix: add changelog Signed-off-by: SdgJlbl --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f21f83aa..d74270cf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Fixed + +- Raise a serializable Exception so that CeleryRetryError won't crash ([#641](https://github.com/Substra/substra-backend/pull/641)) +- Do not retry on non-timeout build errors ([#641](https://github.com/Substra/substra-backend/pull/641)) + ## [0.36.1](https://github.com/Substra/substra-backend/releases/tag/0.36.1) 2023-04-21 ### Fixed From 41251f05c22cee52bbc8f865310592fe4915a63b Mon Sep 17 00:00:00 2001 From: SdgJlbl Date: Thu, 27 Apr 2023 16:38:03 +0200 Subject: [PATCH 4/6] fix: add unit tests Signed-off-by: SdgJlbl --- .../substrapp/tests/test_kubernetes_utils.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/backend/substrapp/tests/test_kubernetes_utils.py b/backend/substrapp/tests/test_kubernetes_utils.py index 8821e4bc4..044584b89 100644 --- a/backend/substrapp/tests/test_kubernetes_utils.py +++ b/backend/substrapp/tests/test_kubernetes_utils.py @@ -34,3 +34,30 @@ def test_get_service_node_port(): service.spec.ports[0].node_port = 9000 port = substrapp.kubernetes_utils.get_service_node_port("my_service") assert port == 9000 + + +def test_get_pod_logs(mocker): + mocker.patch("kubernetes.client.CoreV1Api.read_namespaced_pod_log", return_value="Super great logs") + k8s_client = kubernetes.client.CoreV1Api() + logs = substrapp.kubernetes_utils.get_pod_logs(k8s_client, "pod_name", "container_name", ignore_pod_not_found=True) + assert logs == "Super great logs" + + +def test_get_pod_logs_not_found(): + with mock.patch("kubernetes.client.CoreV1Api.read_namespaced_pod_log") as read_pod: + read_pod.side_effect = kubernetes.client.ApiException(404, "Not Found") + k8s_client = kubernetes.client.CoreV1Api() + logs = substrapp.kubernetes_utils.get_pod_logs( + k8s_client, "pod_name", "container_name", ignore_pod_not_found=True + ) + assert "Pod not found" in logs + + +def test_get_pod_logs_bad_request(): + with mock.patch("kubernetes.client.CoreV1Api.read_namespaced_pod_log") as read_pod: + read_pod.side_effect = kubernetes.client.ApiException(400, "Bad Request") + k8s_client = kubernetes.client.CoreV1Api() + logs = substrapp.kubernetes_utils.get_pod_logs( + k8s_client, "pod_name", "container_name", ignore_pod_not_found=True + ) + assert "pod_name" in logs From 51fabaf1608b270751a27ebb2ca29e8529b58fa6 Mon Sep 17 00:00:00 2001 From: SdgJlbl Date: Thu, 27 Apr 2023 16:45:27 +0200 Subject: [PATCH 5/6] fix: remove useless class in test_image_builder Signed-off-by: SdgJlbl --- .../tests/compute_tasks/test_image_builder.py | 96 ++++++++++--------- 1 file changed, 49 insertions(+), 47 deletions(-) diff --git a/backend/substrapp/tests/compute_tasks/test_image_builder.py b/backend/substrapp/tests/compute_tasks/test_image_builder.py index 6d6310626..169e133e0 100644 --- a/backend/substrapp/tests/compute_tasks/test_image_builder.py +++ b/backend/substrapp/tests/compute_tasks/test_image_builder.py @@ -22,53 +22,55 @@ """ -class TestBuildImageIfMissing: - def test_image_already_exists(self, mocker: MockerFixture, function: orchestrator.Function): - ds = mocker.Mock() - m_container_image_exists = mocker.patch( - "substrapp.compute_tasks.image_builder.container_image_exists", return_value=True - ) - function_image_tag = utils.container_image_tag_from_function(function) - - image_builder.build_image_if_missing(datastore=ds, function=function) - - m_container_image_exists.assert_called_once_with(function_image_tag) - - def test_image_build_needed(self, mocker: MockerFixture, function: orchestrator.Function): - ds = mocker.Mock() - m_container_image_exists = mocker.patch( - "substrapp.compute_tasks.image_builder.container_image_exists", return_value=False - ) - m_build_function_image = mocker.patch("substrapp.compute_tasks.image_builder._build_function_image") - function_image_tag = utils.container_image_tag_from_function(function) - - image_builder.build_image_if_missing(datastore=ds, function=function) - - m_container_image_exists.assert_called_once_with(function_image_tag) - m_build_function_image.assert_called_once() - assert m_build_function_image.call_args.args[1] == function - - -class TestGetEntrypointFromDockerfile: - def test_valid_dockerfile(self, tmp_path: pathlib.Path): - dockerfile_path = tmp_path / "Dockerfile" - dockerfile_path.write_text(_VALID_DOCKERFILE) - entrypoint = image_builder._get_entrypoint_from_dockerfile(str(tmp_path)) - - assert entrypoint == ["python3", "myfunction.py"] - - @pytest.mark.parametrize( - "dockerfile,expected_exc_content", - [ - pytest.param(_NO_ENTRYPOINT, "Invalid Dockerfile: Cannot find ENTRYPOINT", id="no entrypoint"), - pytest.param(_ENTRYPOINT_SHELL_FORM, "Invalid ENTRYPOINT", id="shell form"), - ], +def test_build_image_if_missing_image_already_exists(mocker: MockerFixture, function: orchestrator.Function): + ds = mocker.Mock() + m_container_image_exists = mocker.patch( + "substrapp.compute_tasks.image_builder.container_image_exists", return_value=True ) - def test_invalid_dockerfile(self, tmp_path: pathlib.Path, dockerfile: str, expected_exc_content: str): - dockerfile_path = tmp_path / "Dockerfile" - dockerfile_path.write_text(dockerfile) + function_image_tag = utils.container_image_tag_from_function(function) - with pytest.raises(compute_task_errors.BuildError) as exc: - image_builder._get_entrypoint_from_dockerfile(str(tmp_path)) + image_builder.build_image_if_missing(datastore=ds, function=function) - assert expected_exc_content in bytes.decode(exc.value.logs.read()) + m_container_image_exists.assert_called_once_with(function_image_tag) + + +def test_build_image_if_missing_image_build_needed(mocker: MockerFixture, function: orchestrator.Function): + ds = mocker.Mock() + m_container_image_exists = mocker.patch( + "substrapp.compute_tasks.image_builder.container_image_exists", return_value=False + ) + m_build_function_image = mocker.patch("substrapp.compute_tasks.image_builder._build_function_image") + function_image_tag = utils.container_image_tag_from_function(function) + + image_builder.build_image_if_missing(datastore=ds, function=function) + + m_container_image_exists.assert_called_once_with(function_image_tag) + m_build_function_image.assert_called_once() + assert m_build_function_image.call_args.args[1] == function + + +def test_get_entrypoint_from_dockerfile_valid_dockerfile(tmp_path: pathlib.Path): + dockerfile_path = tmp_path / "Dockerfile" + dockerfile_path.write_text(_VALID_DOCKERFILE) + entrypoint = image_builder._get_entrypoint_from_dockerfile(str(tmp_path)) + + assert entrypoint == ["python3", "myfunction.py"] + + +@pytest.mark.parametrize( + "dockerfile,expected_exc_content", + [ + pytest.param(_NO_ENTRYPOINT, "Invalid Dockerfile: Cannot find ENTRYPOINT", id="no entrypoint"), + pytest.param(_ENTRYPOINT_SHELL_FORM, "Invalid ENTRYPOINT", id="shell form"), + ], +) +def test_get_entrypoint_from_dockerfile_invalid_dockerfile( + tmp_path: pathlib.Path, dockerfile: str, expected_exc_content: str +): + dockerfile_path = tmp_path / "Dockerfile" + dockerfile_path.write_text(dockerfile) + + with pytest.raises(compute_task_errors.BuildError) as exc: + image_builder._get_entrypoint_from_dockerfile(str(tmp_path)) + + assert expected_exc_content in bytes.decode(exc.value.logs.read()) From 002ad8a27d7515055b57a5da43856ba035213776 Mon Sep 17 00:00:00 2001 From: SdgJlbl Date: Fri, 28 Apr 2023 18:07:28 +0200 Subject: [PATCH 6/6] fix: do not split kaniko logs on several lines Signed-off-by: SdgJlbl --- backend/substrapp/compute_tasks/image_builder.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/substrapp/compute_tasks/image_builder.py b/backend/substrapp/compute_tasks/image_builder.py index 3df3a648d..691788911 100644 --- a/backend/substrapp/compute_tasks/image_builder.py +++ b/backend/substrapp/compute_tasks/image_builder.py @@ -120,8 +120,7 @@ def _delete_kaniko_pod(create_pod: bool, k8s_client: kubernetes.client.CoreV1Api if create_pod: logs = get_pod_logs(k8s_client, pod_name, KANIKO_CONTAINER_NAME, ignore_pod_not_found=True) delete_pod(k8s_client, pod_name) - for line in (logs or "").split("\n"): - logger.info(line, pod_name=pod_name) + logger.info(logs or "", pod_name=pod_name) return logs