Build: use rclone for sync #9842

Merged 7 commits on Jan 25, 2023
Changes from 3 commits
2 changes: 2 additions & 0 deletions .circleci/config.yml
@@ -18,6 +18,8 @@ jobs:
- checkout
- run: git submodule sync
- run: git submodule update --init
- run: sudo apt update
- run: sudo apt install -y rclone
- run: pip install --user 'tox<5'
- run: tox -e py310
- codecov/upload
3 changes: 2 additions & 1 deletion dockerfiles/Dockerfile
@@ -30,7 +30,8 @@ RUN apt-get -y install \
netcat \
telnet \
lsb-release \
npm
npm \
rclone

# Gets the MinIO mc client used to add buckets upon initialization
# If this client should have issues running inside this image, it is also
19 changes: 19 additions & 0 deletions readthedocs/builds/storage.py
@@ -1,3 +1,4 @@
from functools import cached_property
from pathlib import Path

import structlog
@@ -7,6 +8,7 @@
from storages.utils import get_available_overwrite_name, safe_join

from readthedocs.core.utils.filesystem import safe_open
from readthedocs.storage.rclone import RCloneLocal

log = structlog.get_logger(__name__)

@@ -153,6 +155,19 @@ def sync_directory(self, source, destination):
log.debug('Deleting file from media storage.', filepath=filepath)
self.delete(filepath)

@cached_property
def _rclone(self):
raise NotImplementedError

def rclone_sync(self, source, destination):
"""Sync a directory recursively to storage using rclone sync."""
if destination in ("", "/"):
raise SuspiciousFileOperation("Syncing all storage cannot be right")

# TODO: borrow the symlink check from #9890 when merged.
# self._check_suspicious_path(source)
return self._rclone.sync(source, destination)

def join(self, directory, filepath):
return safe_join(directory, filepath)

@@ -187,6 +202,10 @@ def __init__(self, **kwargs):

super().__init__(location)

@cached_property
def _rclone(self):
return RCloneLocal(location=self.location)

def get_available_name(self, name, max_length=None):
"""
A hack to overwrite by default with the FileSystemStorage.
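For context on the new rclone_sync method above, a hypothetical call site (both paths are made up for illustration):

    storage = BuildMediaFileSystemStorage()
    # Mirrors the local build output into media storage under the given
    # prefix; like ``rclone sync``, files that no longer exist in the
    # source are deleted from the destination.
    storage.rclone_sync("/tmp/project/latest/html", "html/project/latest")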
5 changes: 5 additions & 0 deletions readthedocs/projects/models.py
@@ -1845,6 +1845,7 @@ def add_features(sender, **kwargs):
USE_SPHINX_BUILDERS = "use_sphinx_builders"
CANCEL_OLD_BUILDS = "cancel_old_builds"
DONT_CREATE_INDEX = "dont_create_index"
USE_RCLONE = "use_rclone"

FEATURES = (
(ALLOW_DEPRECATED_WEBHOOKS, _('Allow deprecated webhook views')),
@@ -2001,6 +2002,10 @@ def add_features(sender, **kwargs):
DONT_CREATE_INDEX,
_('Do not create index.md or README.rst if the project does not have one.'),
),
(
USE_RCLONE,
_("Use rclone for syncing files to the media storage."),
),
)

projects = models.ManyToManyField(
5 changes: 4 additions & 1 deletion readthedocs/projects/tasks/builds.py
@@ -833,7 +833,10 @@ def store_build_artifacts(
version_type=self.data.version.type,
)
try:
build_media_storage.sync_directory(from_path, to_path)
if self.data.project.has_feature(Feature.USE_RCLONE):
build_media_storage.rclone_sync(from_path, to_path)
else:
build_media_storage.sync_directory(from_path, to_path)
except Exception:
# Ideally this should just be an IOError
# but some storage backends unfortunately throw other errors
61 changes: 61 additions & 0 deletions readthedocs/rtd_tests/tests/test_build_storage.py
@@ -1,7 +1,10 @@
import os
import shutil
import tempfile
from pathlib import Path

import pytest
from django.core.exceptions import SuspiciousFileOperation
from django.test import TestCase, override_settings

from readthedocs.builds.storage import BuildMediaFileSystemStorage
@@ -118,3 +121,61 @@ def test_walk(self):
self.assertEqual(top, 'files/api')
self.assertCountEqual(dirs, [])
self.assertCountEqual(files, ['index.html'])

def test_rclone_sync(self):
tmp_files_dir = Path(tempfile.mkdtemp()) / "files"
shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
storage_dir = "files"

tree = [
("api", ["index.html"]),
"api.fjson",
"conf.py",
"test.html",
]
with override_settings(DOCROOT=tmp_files_dir):
self.storage.rclone_sync(tmp_files_dir, storage_dir)
self.assertFileTree(storage_dir, tree)

tree = [
("api", ["index.html"]),
"conf.py",
"test.html",
]
(tmp_files_dir / "api.fjson").unlink()
with override_settings(DOCROOT=tmp_files_dir):
self.storage.rclone_sync(tmp_files_dir, storage_dir)
self.assertFileTree(storage_dir, tree)

tree = [
"conf.py",
"test.html",
]
shutil.rmtree(tmp_files_dir / "api")
with override_settings(DOCROOT=tmp_files_dir):
self.storage.rclone_sync(tmp_files_dir, storage_dir)
self.assertFileTree(storage_dir, tree)

@pytest.mark.skip(
"Waiting for https://github.com/readthedocs/readthedocs.org/pull/9890"
)
def test_rclone_sync_source_symlink(self):
tmp_dir = Path(tempfile.mkdtemp())
tmp_symlink_dir = Path(tempfile.mkdtemp()) / "files"
tmp_symlink_dir.symlink_to(tmp_dir)

with override_settings(DOCROOT=tmp_dir):
with pytest.raises(SuspiciousFileOperation, match="symbolic link"):
self.storage.rclone_sync(tmp_symlink_dir, "files")

@pytest.mark.skip(
"Waiting for https://github.com/readthedocs/readthedocs.org/pull/9890"
)
def test_rclone_sync_source_outside_docroot(self):
tmp_dir = Path(tempfile.mkdtemp())
tmp_docroot = Path(tempfile.mkdtemp()) / "docroot"
tmp_docroot.mkdir()

with override_settings(DOCROOT=tmp_docroot):
with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
self.storage.rclone_sync(tmp_dir, "files")
185 changes: 185 additions & 0 deletions readthedocs/storage/rclone.py
@@ -0,0 +1,185 @@
"""
Wrapper around the rclone command.

See https://rclone.org/docs.
"""

import os
import subprocess

import structlog
from django.utils._os import safe_join as safe_join_fs
from storages.utils import safe_join

log = structlog.get_logger(__name__)


class BaseRClone:

"""
RClone base class.

This class allows you to interact with an rclone remote without
a configuration file; the remote declaration and its options
are passed in the command itself.

This base class allows you to use the local file system as remote.

:param remote_type: You can see the full list of supported providers at
https://rclone.org/#providers.
:param rclone_bin: Binary name or path to the rclone binary.
Defaults to ``rclone``.
:param default_options: Options passed to the rclone command.
:param env_vars: Environment variables used when executing the rclone command.
Useful to pass secrets to the command, since all arguments and options will be logged.
"""

remote_type = None
rclone_bin = "rclone"
default_options = [
# Number of file transfers to run in parallel.
"--transfers=8",
# Skip based on checksum (if available) & size, not mod-time & size.
"--checksum",
"--verbose",
]
env_vars = {}

# pylint: disable=no-self-use
def _build_target_path(self, path):
"""Build the final target path for the remote."""
return path

def build_target(self, path):
"""
Build the proper target using the current remote type.

We start the remote with `:` to create it on the fly,
instead of having to create a configuration file.
See https://rclone.org/docs/#backend-path-to-dir.

:param path: Path to the remote target.
"""
path = self._build_target_path(path)
return f":{self.remote_type}:{path}"

def execute(self, action, args, options=None):
"""
Execute an rclone subcommand.

:param action: Name of the subcommand.
:param list args: List of positional arguments passed to the command.
:param list options: List of options passed to the command.
"""
options = options or []
command = [
self.rclone_bin,
action,
*self.default_options,
*options,
"--",
*args,
]
env = os.environ.copy()
Member: Is there any reason why we want to use the same environment as the one where the process is running? Can't we just pass only the variables from self.env_vars?

Member Author: When you pass env vars to the command, they replace ALL the environment variables, which means other env vars like PATH will be undefined.
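To make that pitfall concrete, a small sketch (the variable values are examples only):

    import os
    import subprocess

    # Passing only the secrets as ``env`` replaces the child's entire
    # environment, so variables like PATH are no longer defined:
    #   subprocess.run(["rclone", "version"], env={"RCLONE_S3_REGION": "us-east-1"})

    # Copying the current environment first keeps PATH (and everything else)
    # intact while still adding the extra variables:
    env = os.environ.copy()
    env.update({"RCLONE_S3_REGION": "us-east-1"})
    subprocess.run(["rclone", "version"], env=env, check=True)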

env.update(self.env_vars)
log.info("Executing rclone command.", command=command)
log.debug("env", env=env)
result = subprocess.run(
Member: subprocess.run will execute the command from the host machine. Wouldn't it be better to execute the command from inside the container instead? That would give us a lot more security and reduce the attack vector, since users won't be able to access any file from the host at all.

I think it's possible, since we have the files to be uploaded on the host, and we only need to pass the correct environment variables to that particular command.

Member Author: We discussed this with Eric some weeks ago; to make it secure, it should be run from another container, otherwise the user could manipulate the executable to expose the secret env vars we pass to it. I'd be +1 on exploring that idea later.

Member (@humitos, Jan 24, 2023): Good point! 💯 We should have some integrity checking before executing, or similar. Sounds good to explore in a future iteration 👍🏼

command,
capture_output=True,
env=env,
# TODO: Fail or let the caller decide what to do?
check=True,
Member Author: Not sure about this one; we could manually check the exit code or catch the exception.

Member: I'm not sure I understand the doubt here. What are the different scenarios and their pros/cons?

Member Author: If the command exits with a status different from 0 and check=True, it will raise an exception; if False (the default), it won't raise and will continue normally (but you can still check the exit code manually).

Member: I think we should wrap it in a try/except block and raise a custom exception that we can handle properly from outside.

)
log.debug(
"Result.",
stdout=result.stdout.decode(),
stderr=result.stderr.decode(),
exit_code=result.returncode,
)
return result
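Sketching the try/except approach suggested in the thread above (RCloneError is a hypothetical exception name, not something this PR defines):

    import subprocess

    class RCloneError(Exception):
        """Hypothetical custom exception for failed rclone commands."""

    def run_rclone(command, env):
        # With check=True, subprocess raises CalledProcessError on a
        # non-zero exit code; re-raising it as our own exception lets
        # callers handle failures without knowing subprocess internals.
        try:
            return subprocess.run(command, capture_output=True, env=env, check=True)
        except subprocess.CalledProcessError as exc:
            raise RCloneError(exc.stderr.decode()) from exc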

def sync(self, source, destination):
"""
Run the `rclone sync` command.

See https://rclone.org/commands/rclone_sync/.

:param source: Local path to the source directory.
:param destination: Remote path to the destination directory.
"""
return self.execute("sync", args=[source, self.build_target(destination)])


class RCloneLocal(BaseRClone):

"""
RClone remote implementation for the local file system.

Used for local testing only.

See https://rclone.org/local/.

:param location: Root directory where the files will be stored.
"""

remote_type = "local"

def __init__(self, location):
self.location = location

def _build_target_path(self, path):
return safe_join_fs(self.location, path)
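To make the on-the-fly remote syntax concrete, a sketch of what RCloneLocal builds (the /srv/media root and both paths are hypothetical):

    remote = RCloneLocal(location="/srv/media")

    # build_target() safe-joins the path onto the configured root and
    # prepends the on-the-fly remote declaration:
    remote.build_target("html/project/latest")
    # -> ":local:/srv/media/html/project/latest"

    # sync() then executes roughly:
    #   rclone sync --transfers=8 --checksum --verbose -- \
    #       /tmp/html :local:/srv/media/html/project/latest
    remote.sync("/tmp/html", "html/project/latest")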


class RCloneS3Remote(BaseRClone):

"""
RClone remote implementation for S3.

All secrets are passed as environment variables.

See https://rclone.org/s3/.

:param bucket_name: Name of the S3 bucket.
:param access_key_id: AWS access key id.
:param secret_access_key: AWS secret access key.
:param region: AWS region.
:param provider: S3 provider, defaults to ``AWS``.
Useful to use Minio during development.
See https://rclone.org/s3/#s3-provider.
:param acl: Canned ACL used when creating buckets and storing or copying objects.
See https://rclone.org/s3/#s3-acl.
:param endpoint: Custom S3 endpoint, useful for development.
"""

remote_type = "s3"

def __init__(
self,
bucket_name,
access_key_id,
secret_access_key,
region,
provider="AWS",
acl=None,
endpoint=None,
):
# rclone S3 options passed as env vars.
# https://rclone.org/s3/#standard-options.
self.env_vars = {
"RCLONE_S3_PROVIDER": provider,
"RCLONE_S3_ACCESS_KEY_ID": access_key_id,
"RCLONE_S3_SECRET_ACCESS_KEY": secret_acces_key,
"RCLONE_S3_REGION": region,
"RCLONE_S3_LOCATION_CONSTRAINT": region,
}
if acl:
self.env_vars["RCLONE_S3_ACL"] = acl
if endpoint:
self.env_vars["RCLONE_S3_ENDPOINT"] = endpoint
self.bucket_name = bucket_name

def _build_target_path(self, path):
"""Overridden to prepend the bucket name to the path."""
return safe_join(self.bucket_name, path)
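As a usage sketch, pointing the S3 remote at a local MinIO instance (the bucket name, credentials, and endpoint are made-up development values):

    remote = RCloneS3Remote(
        bucket_name="media",
        access_key_id="admin",
        secret_access_key="password",
        region="us-east-1",
        provider="Minio",
        endpoint="http://localhost:9000",
    )

    # The secrets travel as RCLONE_S3_* environment variables rather than
    # command-line arguments, so they never appear in the logged command.
    remote.sync("/tmp/html", "html/project/latest")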