[SAT-29018] Fix/corrupted RA blocks content streaming #6064

Merged
4 changes: 4 additions & 0 deletions CHANGES/5725.bugfix
@@ -0,0 +1,4 @@
On a request for on-demand content in the content app, a corrupted Remote that
contained the wrong binary (for that content) prevented other Remotes from being
attempted on future requests. Now the last failed Remotes are temporarily ignored
and others may be picked.
24 changes: 20 additions & 4 deletions pulp_file/pytest_plugin.py
@@ -83,11 +83,11 @@ def file_fixtures_root(tmp_path):

@pytest.fixture
def write_3_iso_file_fixture_data_factory(file_fixtures_root):
def _write_3_iso_file_fixture_data_factory(name, overwrite=False):
def _write_3_iso_file_fixture_data_factory(name, overwrite=False, seed=None):
file_fixtures_root.joinpath(name).mkdir(exist_ok=overwrite)
file1 = generate_iso(file_fixtures_root.joinpath(f"{name}/1.iso"))
file2 = generate_iso(file_fixtures_root.joinpath(f"{name}/2.iso"))
file3 = generate_iso(file_fixtures_root.joinpath(f"{name}/3.iso"))
file1 = generate_iso(file_fixtures_root.joinpath(f"{name}/1.iso"), seed=seed)
file2 = generate_iso(file_fixtures_root.joinpath(f"{name}/2.iso"), seed=seed)
file3 = generate_iso(file_fixtures_root.joinpath(f"{name}/3.iso"), seed=seed)
generate_manifest(
file_fixtures_root.joinpath(f"{name}/PULP_MANIFEST"), [file1, file2, file3]
)
@@ -364,3 +364,19 @@ def _wget_recursive_download_on_host(url, destination):
)

return _wget_recursive_download_on_host


@pytest.fixture
def generate_server_and_remote(
file_bindings, gen_fixture_server, file_fixtures_root, gen_object_with_cleanup
):
def _generate_server_and_remote(*, manifest_path, policy):
server = gen_fixture_server(file_fixtures_root, None)
url = server.make_url(manifest_path)
remote = gen_object_with_cleanup(
file_bindings.RemotesFileApi,
{"name": str(uuid.uuid4()), "url": str(url), "policy": policy},
)
return server, remote

yield _generate_server_and_remote
pedro-psb (Member, Author) commented:
Moved from test_acs to facilitate creating an ACS + server.
I need an ACS here because its associated RA has priority on content-streaming.
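
A usage sketch of the moved fixture (the call shape mirrors the new test below; the manifest path is a placeholder, not a value from this PR):

    # Spin up a fixture server and an on-demand Remote pointing at it.
    server, remote = generate_server_and_remote(
        manifest_path="/acs/PULP_MANIFEST", policy="on_demand"
    )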

16 changes: 0 additions & 16 deletions pulp_file/tests/functional/api/test_acs.py
@@ -12,22 +12,6 @@
)


@pytest.fixture
def generate_server_and_remote(
file_bindings, gen_fixture_server, file_fixtures_root, gen_object_with_cleanup
):
def _generate_server_and_remote(*, manifest_path, policy):
server = gen_fixture_server(file_fixtures_root, None)
url = server.make_url(manifest_path)
remote = gen_object_with_cleanup(
file_bindings.RemotesFileApi,
{"name": str(uuid.uuid4()), "url": str(url), "policy": policy},
)
return server, remote

yield _generate_server_and_remote


@pytest.mark.parallel
def test_acs_validation_and_update(
file_bindings,
18 changes: 18 additions & 0 deletions pulpcore/app/migrations/0126_remoteartifact_failed_at.py
@@ -0,0 +1,18 @@
# Generated by Django 4.2.16 on 2024-11-27 15:06

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('core', '0125_openpgpdistribution_openpgpkeyring_openpgppublickey_and_more'),
]

operations = [
migrations.AddField(
model_name='remoteartifact',
name='failed_at',
field=models.DateTimeField(null=True),
),
]
2 changes: 2 additions & 0 deletions pulpcore/app/models/content.py
@@ -703,6 +703,7 @@ class RemoteArtifact(BaseModel, QueryMixin):
sha256 (models.CharField): The expected SHA-256 checksum of the file.
sha384 (models.CharField): The expected SHA-384 checksum of the file.
sha512 (models.CharField): The expected SHA-512 checksum of the file.
failed_at (models.DateTimeField): The datetime of last download attempt failure.

Relations:

@@ -721,6 +722,7 @@ class RemoteArtifact(BaseModel, QueryMixin):
sha256 = models.CharField(max_length=64, null=True, db_index=True)
sha384 = models.CharField(max_length=96, null=True, db_index=True)
sha512 = models.CharField(max_length=128, null=True, db_index=True)
failed_at = models.DateTimeField(null=True)

content_artifact = models.ForeignKey(ContentArtifact, on_delete=models.CASCADE)
remote = models.ForeignKey("Remote", on_delete=models.CASCADE)
5 changes: 5 additions & 0 deletions pulpcore/app/settings.py
@@ -295,6 +295,11 @@
"EXPIRES_TTL": 600, # 10 minutes
}

# The time a RemoteArtifact will be ignored after a failure.
# For on-demand content, if fetching from a remote fails due to corrupt data,
# the corresponding RemoteArtifact will be ignored for that time (seconds).
REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN = 5 * 60  # 5 minutes

SPECTACULAR_SETTINGS = {
"SERVE_URLCONF": ROOT_URLCONF,
"DEFAULT_GENERATOR_CLASS": "pulpcore.openapi.PulpSchemaGenerator",
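Since this is a regular Pulp setting, deployments should be able to override it like any other; the value and override mechanism below are illustrative, assuming a standard dynaconf-backed setup:

    # /etc/pulp/settings.py (or, assuming the usual dynaconf prefix, the
    # PULP_REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN environment variable)
    REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN = 30 * 60  # ignore a failed RemoteArtifact for 30 minutes
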
35 changes: 24 additions & 11 deletions pulpcore/content/handler.py
@@ -6,6 +6,7 @@
import socket
import struct
from gettext import gettext as _
from datetime import timedelta

from aiohttp.client_exceptions import ClientResponseError, ClientConnectionError
from aiohttp.web import FileResponse, StreamResponse, HTTPOk
@@ -22,6 +23,7 @@
from asgiref.sync import sync_to_async

import django
from django.utils import timezone

from pulpcore.constants import STORAGE_RESPONSE_MAP
from pulpcore.responses import ArtifactResponse
@@ -820,22 +822,25 @@ async def _stream_content_artifact(self, request, response, content_artifact):
[pulpcore.plugin.models.ContentArtifact][] returned the binary data needed for
the client.
"""
# We should only retry with exceptions that happen before we receive any data
# We should only skip exceptions that happen before we receive any data
# and start streaming, as we can't rollback data if something happens after that.
RETRYABLE_EXCEPTIONS = (
SKIPPABLE_EXCEPTIONS = (
ClientResponseError,
UnsupportedDigestValidationError,
ClientConnectionError,
)

remote_artifacts = content_artifact.remoteartifact_set.select_related(
"remote"
).order_by_acs()
protection_time = settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN
remote_artifacts = (
content_artifact.remoteartifact_set.select_related("remote")
.order_by_acs()
.exclude(failed_at__gte=timezone.now() - timedelta(seconds=protection_time))
)
pedro-psb (Member, Author) commented:
I thought it would be nice to start simple, but maybe this threshold could depend on the content size? Or be configurable via settings?

mdellweg (Contributor) replied:

Configurable is probably better

pedro-psb (Member, Author) replied:
@mdellweg On the first approach that was a constant, but the "proper" value is subjective.
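
For reference, a minimal standalone sketch of the cooldown filter added above (the queryset chain is taken from the diff; wrapping it in a named helper is purely illustrative):

    from datetime import timedelta

    from django.conf import settings
    from django.utils import timezone

    def eligible_remote_artifacts(content_artifact):
        # Exclude RemoteArtifacts that failed within the cooldown window, so a
        # corrupted Remote cannot keep shadowing healthy ones between requests.
        cooldown = settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN
        cutoff = timezone.now() - timedelta(seconds=cooldown)
        return (
            content_artifact.remoteartifact_set.select_related("remote")
            .order_by_acs()  # ACS-backed RemoteArtifacts keep their priority
            .exclude(failed_at__gte=cutoff)
        )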

async for remote_artifact in remote_artifacts:
try:
response = await self._stream_remote_artifact(request, response, remote_artifact)
return response
except RETRYABLE_EXCEPTIONS as e:
except SKIPPABLE_EXCEPTIONS as e:
log.warning(
"Could not download remote artifact at '{}': {}".format(
remote_artifact.url, str(e)
@@ -1142,14 +1147,22 @@ async def finalize():
extra_data={"disable_retry_list": (DigestValidationError,)}
)
except DigestValidationError:
remote_artifact.failed_at = timezone.now()
await remote_artifact.asave()
await downloader.session.close()
close_tcp_connection(request.transport._sock)
REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN = settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN
raise RuntimeError(
f"We tried streaming {remote_artifact.url!r} to the client, but it "
"failed checkusm validation. "
"At this point, we cant recover from wrong data already sent, "
"so we are forcing the connection to close. "
"If this error persists, the remote server might be corrupted."
f"Pulp tried streaming {remote_artifact.url!r} to "
"the client, but it failed checksum validation.\n\n"
"We can't recover from wrong data already sent so we are:\n"
"- Forcing the connection to close.\n"
"- Marking this Remote to be ignored for "
f"{REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN=}s.\n\n"
"If the Remote is known to be fixed, try resyncing the associated repository.\n"
"If the Remote is known to be permanently corrupted, try removing "
"affected Pulp Remote, adding a good one and resyncing.\n"
"If the problem persists, please contact the Pulp team."
)

if content_length := response.headers.get("Content-Length"):
121 changes: 108 additions & 13 deletions pulpcore/tests/functional/api/using_plugin/test_content_delivery.py
@@ -1,15 +1,14 @@
"""Tests related to content delivery."""

from aiohttp.client_exceptions import ClientResponseError, ClientPayloadError
import hashlib
import pytest
import subprocess
import uuid
from urllib.parse import urljoin

from pulpcore.client.pulp_file import (
RepositorySyncURL,
)
import pytest
from aiohttp.client_exceptions import ClientPayloadError, ClientResponseError

from pulpcore.client.pulp_file import RepositorySyncURL
from pulpcore.tests.functional.utils import download_file, get_files_in_manifest


@@ -116,8 +115,13 @@ def test_remote_content_changed_with_on_demand(
):
"""
GIVEN a remote synced on demand with fileA (e.g., digest=123),
WHEN on the remote server, fileA changed its content (e.g, digest=456),
THEN retrieving fileA from the content app will cause a connection-close/incomplete-response.
AND on the remote server, fileA changed its content (e.g., digest=456),

WHEN the client first requests that content
THEN the content app will start a response but close the connection before finishing

WHEN the client requests that content again (within the RA cooldown interval)
THEN the content app will return a 404
"""
# GIVEN
basic_manifest_path = write_3_iso_file_fixture_data_factory("basic")
@@ -129,17 +133,108 @@
repo = file_bindings.RepositoriesFileApi.read(file_repo_with_auto_publish.pulp_href)
distribution = file_distribution_factory(repository=repo.pulp_href)
expected_file_list = list(get_files_in_manifest(remote.url))

# WHEN
write_3_iso_file_fixture_data_factory("basic", overwrite=True)

# THEN
get_url = urljoin(distribution.base_url, expected_file_list[0][0])
with pytest.raises(ClientPayloadError, match="Response payload is not completed"):
download_file(get_url)

# Assert again with curl just to be sure.
pedro-psb (Member, Author) commented:
Reason:

This is testing the close-connection on digest validation.
The implementation in this PR causes the second request (from curl) to get a 404 instead, because the only (and corrupted) RA is temporarily ignored.

Here I wanted to test different clients, but testing only curl is enough.

A Member replied:

If we added another call to curl right away, we would expect a different error, because the "only" RA is corrupted, right?
Would it be easy to add that to this test?

# WHEN (first request)
result = subprocess.run(["curl", "-v", get_url], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# THEN
assert result.returncode == 18
assert b"* Closing connection 0" in result.stderr
assert b"curl: (18) transfer closed with outstanding read data remaining" in result.stderr

# WHEN (second request)
result = subprocess.run(["curl", "-v", get_url], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

# THEN
assert result.returncode == 0
assert b"< HTTP/1.1 404 Not Found" in result.stderr


@pytest.mark.parallel
def test_handling_remote_artifact_on_demand_streaming_failure(
write_3_iso_file_fixture_data_factory,
file_repo_with_auto_publish,
file_remote_factory,
file_bindings,
monitor_task,
monitor_task_group,
file_distribution_factory,
gen_object_with_cleanup,
generate_server_and_remote,
):
"""
GIVEN a content unit synced on-demand which has 2 RemoteArtifacts (Remote + ACS).
AND only the ACS RemoteArtifact (which has priority in the content-app) is corrupted

WHEN a client requests the content for the first time
THEN the client doesn't get any content

WHEN a client requests the content for the second time
THEN the client gets the right content
"""

# Plumbing
def create_simple_remote(manifest_path):
remote = file_remote_factory(manifest_path=manifest_path, policy="on_demand")
body = RepositorySyncURL(remote=remote.pulp_href)
monitor_task(
file_bindings.RepositoriesFileApi.sync(file_repo_with_auto_publish.pulp_href, body).task
)
return remote

def create_acs_remote(manifest_path):
acs_server, acs_remote = generate_server_and_remote(
manifest_path=manifest_path, policy="on_demand"
)
acs = gen_object_with_cleanup(
file_bindings.AcsFileApi,
{"remote": acs_remote.pulp_href, "paths": [], "name": str(uuid.uuid4())},
)
monitor_task_group(file_bindings.AcsFileApi.refresh(acs.pulp_href).task_group)
return acs

def sync_publish_and_distribute(remote):
body = RepositorySyncURL(remote=remote.pulp_href)
monitor_task(
file_bindings.RepositoriesFileApi.sync(file_repo_with_auto_publish.pulp_href, body).task
)
repo = file_bindings.RepositoriesFileApi.read(file_repo_with_auto_publish.pulp_href)
distribution = file_distribution_factory(repository=repo.pulp_href)
return distribution

def refresh_acs(acs):
monitor_task_group(file_bindings.AcsFileApi.refresh(acs.pulp_href).task_group)
return acs

def get_original_content_info(remote):
expected_files = get_files_in_manifest(remote.url)
content_unit = list(expected_files)[0]
return content_unit[0], content_unit[1]

def download_from_distribution(content, distribution):
content_unit_url = urljoin(distribution.base_url, content_name)
downloaded_file = download_file(content_unit_url)
actual_checksum = hashlib.sha256(downloaded_file.body).hexdigest()
return actual_checksum

# GIVEN
basic_manifest_path = write_3_iso_file_fixture_data_factory("basic", seed=123)
acs_manifest_path = write_3_iso_file_fixture_data_factory("acs", seed=123)
remote = create_simple_remote(basic_manifest_path)
distribution = sync_publish_and_distribute(remote)
acs = create_acs_remote(acs_manifest_path)
refresh_acs(acs)
write_3_iso_file_fixture_data_factory("acs", overwrite=True) # corrupt

# WHEN/THEN (first request)
content_name, expected_checksum = get_original_content_info(remote)

with pytest.raises(ClientPayloadError, match="Response payload is not completed"):
download_from_distribution(content_name, distribution)

# WHEN/THEN (second request)
actual_checksum = download_from_distribution(content_name, distribution)
assert actual_checksum == expected_checksum
9 changes: 7 additions & 2 deletions pulpcore/tests/functional/utils.py
@@ -4,6 +4,7 @@
import asyncio
import hashlib
import os
import random

from aiohttp import web
from dataclasses import dataclass
@@ -103,10 +104,14 @@ async def _download_file(url, auth=None, headers=None):
return MockDownload(body=await response.read(), response_obj=response)


def generate_iso(full_path, size=1024, relative_path=None):
def generate_iso(full_path, size=1024, relative_path=None, seed=None):
"""Generate a random file."""
with open(full_path, "wb") as fout:
contents = os.urandom(size)
if seed:
random.seed(seed)
contents = random.randbytes(size)
else:
contents = os.urandom(size)
pedro-psb (Member, Author) commented on Nov 27, 2024:

Reason:

Facilitate creating two different files with the same content, to exercise corruption scenarios (see the usage sketch after this diff).

fout.write(contents)
fout.flush()
digest = hashlib.sha256(contents).hexdigest()
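
A usage sketch of the new seed parameter (the dict return value with a "digest" key is assumed from the surrounding fixture code, since the function's return statement is not shown in this diff; paths are placeholders whose parent directories must already exist):

    from pathlib import Path

    # Two fixture trees with byte-identical files: same seed, same bytes, same checksum.
    good = generate_iso(Path("/tmp/basic/1.iso"), seed=123)
    copy = generate_iso(Path("/tmp/acs/1.iso"), seed=123)
    assert good["digest"] == copy["digest"]  # assumed return shape: {"name", "size", "digest"}

    # Rewriting one tree without a seed (as the test's overwrite step does) produces
    # different bytes, i.e. content that no longer matches the recorded checksum.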