-
Notifications
You must be signed in to change notification settings - Fork 116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SAT-29018] Fix/corrupted RA blocks content streaming #6064
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
On a request for on-demand content in the content app, a corrupted Remote that | ||
contains the wrong binary (for that content) prevented other Remotes from being | ||
attempted on future requests. Now the last failed Remotes are temporarily ignored | ||
and others may be picked. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Generated by Django 4.2.16 on 2024-11-27 15:06 | ||
|
||
from django.db import migrations, models | ||
|
||
|
||
class Migration(migrations.Migration):
    # Adds a nullable "failed_at" timestamp to RemoteArtifact.
    # NOTE(review): per the content-app change in this PR, a set value records
    # when the last on-demand fetch from this remote failed checksum validation,
    # so the remote can be temporarily excluded (cooldown); null means no
    # recorded failure.

    dependencies = [
        ('core', '0125_openpgpdistribution_openpgpkeyring_openpgppublickey_and_more'),
    ]

    operations = [
        migrations.AddField(
            model_name='remoteartifact',
            name='failed_at',
            # Nullable so existing rows need no backfill and "never failed"
            # is representable.
            field=models.DateTimeField(null=True),
        ),
    ]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
import socket | ||
import struct | ||
from gettext import gettext as _ | ||
from datetime import timedelta | ||
|
||
from aiohttp.client_exceptions import ClientResponseError, ClientConnectionError | ||
from aiohttp.web import FileResponse, StreamResponse, HTTPOk | ||
|
@@ -22,6 +23,7 @@ | |
from asgiref.sync import sync_to_async | ||
|
||
import django | ||
from django.utils import timezone | ||
|
||
from pulpcore.constants import STORAGE_RESPONSE_MAP | ||
from pulpcore.responses import ArtifactResponse | ||
|
@@ -820,22 +822,25 @@ async def _stream_content_artifact(self, request, response, content_artifact): | |
[pulpcore.plugin.models.ContentArtifact][] returned the binary data needed for | ||
the client. | ||
""" | ||
# We should only retry with exceptions that happen before we receive any data | ||
# We should only skip exceptions that happen before we receive any data | ||
# and start streaming, as we can't rollback data if something happens after that. | ||
RETRYABLE_EXCEPTIONS = ( | ||
SKIPPABLE_EXCEPTIONS = ( | ||
ClientResponseError, | ||
UnsupportedDigestValidationError, | ||
ClientConnectionError, | ||
) | ||
|
||
remote_artifacts = content_artifact.remoteartifact_set.select_related( | ||
"remote" | ||
).order_by_acs() | ||
protection_time = settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN | ||
remote_artifacts = ( | ||
content_artifact.remoteartifact_set.select_related("remote") | ||
.order_by_acs() | ||
.exclude(failed_at__gte=timezone.now() - timedelta(seconds=protection_time)) | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I thought it would be nice to start simple, but maybe this threshold could depend on the content size? Or be configurable via settings? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Configurable is probably better There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @mdellweg On the first approach that was a constant, but the "proper" value is subjective. |
||
async for remote_artifact in remote_artifacts: | ||
try: | ||
response = await self._stream_remote_artifact(request, response, remote_artifact) | ||
return response | ||
except RETRYABLE_EXCEPTIONS as e: | ||
except SKIPPABLE_EXCEPTIONS as e: | ||
log.warning( | ||
"Could not download remote artifact at '{}': {}".format( | ||
remote_artifact.url, str(e) | ||
|
@@ -1142,14 +1147,22 @@ async def finalize(): | |
extra_data={"disable_retry_list": (DigestValidationError,)} | ||
) | ||
except DigestValidationError: | ||
remote_artifact.failed_at = timezone.now() | ||
await remote_artifact.asave() | ||
await downloader.session.close() | ||
close_tcp_connection(request.transport._sock) | ||
REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN = settings.REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN | ||
raise RuntimeError( | ||
f"We tried streaming {remote_artifact.url!r} to the client, but it " | ||
"failed checkusm validation. " | ||
"At this point, we cant recover from wrong data already sent, " | ||
"so we are forcing the connection to close. " | ||
"If this error persists, the remote server might be corrupted." | ||
f"Pulp tried streaming {remote_artifact.url!r} to " | ||
"the client, but it failed checksum validation.\n\n" | ||
"We can't recover from wrong data already sent so we are:\n" | ||
"- Forcing the connection to close.\n" | ||
"- Marking this Remote to be ignored for " | ||
f"{REMOTE_CONTENT_FETCH_FAILURE_COOLDOWN=}s.\n\n" | ||
"If the Remote is known to be fixed, try resyncing the associated repository.\n" | ||
"If the Remote is known to be permanently corrupted, try removing " | ||
"affected Pulp Remote, adding a good one and resyncing.\n" | ||
"If the problem persists, please contact the Pulp team." | ||
pedro-psb marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
|
||
if content_length := response.headers.get("Content-Length"): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,15 +1,14 @@ | ||
"""Tests related to content delivery.""" | ||
|
||
from aiohttp.client_exceptions import ClientResponseError, ClientPayloadError | ||
import hashlib | ||
import pytest | ||
import subprocess | ||
import uuid | ||
from urllib.parse import urljoin | ||
|
||
from pulpcore.client.pulp_file import ( | ||
RepositorySyncURL, | ||
) | ||
import pytest | ||
from aiohttp.client_exceptions import ClientPayloadError, ClientResponseError | ||
|
||
from pulpcore.client.pulp_file import RepositorySyncURL | ||
from pulpcore.tests.functional.utils import download_file, get_files_in_manifest | ||
|
||
|
||
|
@@ -116,8 +115,13 @@ def test_remote_content_changed_with_on_demand( | |
): | ||
""" | ||
GIVEN a remote synced on demand with fileA (e.g, digest=123), | ||
WHEN on the remote server, fileA changed its content (e.g, digest=456), | ||
THEN retrieving fileA from the content app will cause a connection-close/incomplete-response. | ||
AND the remote server, fileA changed its content (e.g, digest=456), | ||
|
||
WHEN the client first requests that content | ||
THEN the content app will start a response but close the connection before finishing | ||
|
||
WHEN the client requests that content again (within the RA cooldown interval) | ||
THEN the content app will return a 404 | ||
""" | ||
# GIVEN | ||
basic_manifest_path = write_3_iso_file_fixture_data_factory("basic") | ||
|
@@ -129,17 +133,108 @@ def test_remote_content_changed_with_on_demand( | |
repo = file_bindings.RepositoriesFileApi.read(file_repo_with_auto_publish.pulp_href) | ||
distribution = file_distribution_factory(repository=repo.pulp_href) | ||
expected_file_list = list(get_files_in_manifest(remote.url)) | ||
|
||
# WHEN | ||
write_3_iso_file_fixture_data_factory("basic", overwrite=True) | ||
|
||
# THEN | ||
get_url = urljoin(distribution.base_url, expected_file_list[0][0]) | ||
with pytest.raises(ClientPayloadError, match="Response payload is not completed"): | ||
download_file(get_url) | ||
|
||
# Assert again with curl just to be sure. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Reason: This is testing the close-connection on digest validation. Here I wanted to test different clients, but testing only curl is enough. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we added another call to curl right away, we would expect a different error, because the "only" RA is corrupted, right? |
||
# WHEN (first request) | ||
result = subprocess.run(["curl", "-v", get_url], stdout=subprocess.PIPE, stderr=subprocess.PIPE) | ||
|
||
# THEN | ||
assert result.returncode == 18 | ||
assert b"* Closing connection 0" in result.stderr | ||
assert b"curl: (18) transfer closed with outstanding read data remaining" in result.stderr | ||
|
||
# WHEN (second request) | ||
result = subprocess.run(["curl", "-v", get_url], stdout=subprocess.PIPE, stderr=subprocess.PIPE) | ||
|
||
# THEN | ||
assert result.returncode == 0 | ||
assert b"< HTTP/1.1 404 Not Found" in result.stderr | ||
|
||
|
||
@pytest.mark.parallel
def test_handling_remote_artifact_on_demand_streaming_failure(
    write_3_iso_file_fixture_data_factory,
    file_repo_with_auto_publish,
    file_remote_factory,
    file_bindings,
    monitor_task,
    monitor_task_group,
    file_distribution_factory,
    gen_object_with_cleanup,
    generate_server_and_remote,
):
    """
    GIVEN a content synced with on-demand which has 2 RemoteArtifacts (Remote + ACS)
    AND only the ACS RemoteArtifact (that has priority on the content-app) is corrupted,

    WHEN a client requests the content for the first time
    THEN the client doesn't get any content,

    WHEN a client requests the content for the second time
    THEN the client gets the right content.
    """

    # Plumbing
    def create_simple_remote(manifest_path):
        # Create an on-demand remote and sync it into the auto-publish repository.
        remote = file_remote_factory(manifest_path=manifest_path, policy="on_demand")
        body = RepositorySyncURL(remote=remote.pulp_href)
        monitor_task(
            file_bindings.RepositoriesFileApi.sync(file_repo_with_auto_publish.pulp_href, body).task
        )
        return remote

    def create_acs_remote(manifest_path):
        # Create an Alternate Content Source backed by its own server/remote pair.
        acs_server, acs_remote = generate_server_and_remote(
            manifest_path=manifest_path, policy="on_demand"
        )
        acs = gen_object_with_cleanup(
            file_bindings.AcsFileApi,
            {"remote": acs_remote.pulp_href, "paths": [], "name": str(uuid.uuid4())},
        )
        monitor_task_group(file_bindings.AcsFileApi.refresh(acs.pulp_href).task_group)
        return acs

    def sync_publish_and_distribute(remote):
        # Sync the repository from the remote, then distribute the (auto-published)
        # repository so its content is reachable through the content app.
        body = RepositorySyncURL(remote=remote.pulp_href)
        monitor_task(
            file_bindings.RepositoriesFileApi.sync(file_repo_with_auto_publish.pulp_href, body).task
        )
        repo = file_bindings.RepositoriesFileApi.read(file_repo_with_auto_publish.pulp_href)
        distribution = file_distribution_factory(repository=repo.pulp_href)
        return distribution

    def refresh_acs(acs):
        # Re-scan the ACS paths so its RemoteArtifacts are (re)created.
        monitor_task_group(file_bindings.AcsFileApi.refresh(acs.pulp_href).task_group)
        return acs

    def get_original_content_info(remote):
        # Return (relative_path, sha256) of the first file listed in the manifest.
        expected_files = get_files_in_manifest(remote.url)
        content_unit = list(expected_files)[0]
        return content_unit[0], content_unit[1]

    def download_from_distribution(content, distribution):
        # Fix: build the URL from the `content` parameter instead of the
        # closed-over `content_name`, which the parameter previously shadowed
        # without being used (the helper only worked by accident).
        content_unit_url = urljoin(distribution.base_url, content)
        downloaded_file = download_file(content_unit_url)
        actual_checksum = hashlib.sha256(downloaded_file.body).hexdigest()
        return actual_checksum

    # GIVEN
    # Two manifests generated with the same seed yield identical file contents,
    # so the ACS initially serves the exact bytes the main remote promises.
    basic_manifest_path = write_3_iso_file_fixture_data_factory("basic", seed=123)
    acs_manifest_path = write_3_iso_file_fixture_data_factory("acs", seed=123)
    remote = create_simple_remote(basic_manifest_path)
    distribution = sync_publish_and_distribute(remote)
    acs = create_acs_remote(acs_manifest_path)
    refresh_acs(acs)
    write_3_iso_file_fixture_data_factory("acs", overwrite=True)  # corrupt

    # WHEN/THEN (first request)
    content_name, expected_checksum = get_original_content_info(remote)

    with pytest.raises(ClientPayloadError, match="Response payload is not completed"):
        download_from_distribution(content_name, distribution)

    # WHEN/THEN (second request)
    # The corrupted ACS RemoteArtifact is now in cooldown, so the content app
    # falls back to the healthy remote and streams the correct bytes.
    actual_checksum = download_from_distribution(content_name, distribution)
    assert actual_checksum == expected_checksum
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
import asyncio | ||
import hashlib | ||
import os | ||
import random | ||
|
||
from aiohttp import web | ||
from dataclasses import dataclass | ||
|
@@ -103,10 +104,14 @@ async def _download_file(url, auth=None, headers=None): | |
return MockDownload(body=await response.read(), response_obj=response) | ||
|
||
|
||
def generate_iso(full_path, size=1024, relative_path=None): | ||
def generate_iso(full_path, size=1024, relative_path=None, seed=None): | ||
"""Generate a random file.""" | ||
with open(full_path, "wb") as fout: | ||
contents = os.urandom(size) | ||
if seed: | ||
random.seed(seed) | ||
contents = random.randbytes(size) | ||
else: | ||
contents = os.urandom(size) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reason: Facilitate creating two different files with same content to exercise corruption scenarios. |
||
fout.write(contents) | ||
fout.flush() | ||
digest = hashlib.sha256(contents).hexdigest() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved from
test_acs
to facilitate creating an ACS + server. I need an ACS here because its associated RA has priority on content-streaming.