Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build: use rclone for sync #9842

Merged
merged 7 commits into from
Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ jobs:
- checkout
- run: git submodule sync
- run: git submodule update --init
- run: sudo apt update
- run: sudo apt install -y rclone
- run: pip install --user 'tox<5'
- run: tox -e py310
- codecov/upload
Expand Down
3 changes: 2 additions & 1 deletion dockerfiles/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ RUN apt-get -y install \
netcat \
telnet \
lsb-release \
npm
npm \
rclone

# Gets the MinIO mc client used to add buckets upon initialization
# If this client should have issues running inside this image, it is also
Expand Down
18 changes: 18 additions & 0 deletions readthedocs/builds/storage.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from functools import cached_property
from pathlib import Path

import structlog
Expand All @@ -7,6 +8,7 @@
from storages.utils import get_available_overwrite_name, safe_join

from readthedocs.core.utils.filesystem import safe_open
from readthedocs.storage.rclone import RCloneLocal

log = structlog.get_logger(__name__)

Expand Down Expand Up @@ -172,6 +174,18 @@ def sync_directory(self, source, destination):
log.debug('Deleting file from media storage.', filepath=filepath)
self.delete(filepath)

@cached_property
def _rclone(self):
    """Return the rclone remote for this storage backend.

    Concrete storage backends must override this property with an
    rclone remote instance (e.g. ``RCloneLocal``).
    """
    raise NotImplementedError

def rclone_sync_directory(self, source, destination):
    """Sync a directory recursively to storage using rclone sync."""
    # Refuse to operate on the storage root: rclone sync deletes
    # anything in the destination that is missing from the source,
    # so an empty/"/" destination could wipe all of storage.
    root_destinations = ("", "/")
    if destination in root_destinations:
        raise SuspiciousFileOperation("Syncing all storage cannot be right")

    # The source must live under DOCROOT and must not be a symlink.
    self._check_suspicious_path(source)
    return self._rclone.sync(source, destination)

def join(self, directory, filepath):
return safe_join(directory, filepath)

Expand Down Expand Up @@ -206,6 +220,10 @@ def __init__(self, **kwargs):

super().__init__(location)

@cached_property
def _rclone(self):
    # Local-filesystem rclone remote rooted at this backend's location.
    return RCloneLocal(location=self.location)

def get_available_name(self, name, max_length=None):
"""
A hack to overwrite by default with the FileSystemStorage.
Expand Down
5 changes: 5 additions & 0 deletions readthedocs/projects/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1850,6 +1850,7 @@ def add_features(sender, **kwargs):
USE_SPHINX_BUILDERS = "use_sphinx_builders"
CANCEL_OLD_BUILDS = "cancel_old_builds"
DONT_CREATE_INDEX = "dont_create_index"
USE_RCLONE = "use_rclone"

FEATURES = (
(ALLOW_DEPRECATED_WEBHOOKS, _('Allow deprecated webhook views')),
Expand Down Expand Up @@ -2006,6 +2007,10 @@ def add_features(sender, **kwargs):
DONT_CREATE_INDEX,
_('Do not create index.md or README.rst if the project does not have one.'),
),
(
USE_RCLONE,
_("Use rclone for syncing files to the media storage."),
),
)

projects = models.ManyToManyField(
Expand Down
5 changes: 4 additions & 1 deletion readthedocs/projects/tasks/builds.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,7 +839,10 @@ def store_build_artifacts(self):
version_type=self.data.version.type,
)
try:
build_media_storage.sync_directory(from_path, to_path)
if self.data.project.has_feature(Feature.USE_RCLONE):
build_media_storage.rclone_sync_directory(from_path, to_path)
else:
build_media_storage.sync_directory(from_path, to_path)
except Exception:
# Ideally this should just be an IOError
# but some storage backends unfortunately throw other errors
Expand Down
52 changes: 52 additions & 0 deletions readthedocs/rtd_tests/tests/test_build_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,55 @@ def test_walk(self):
self.assertEqual(top, 'files/api')
self.assertCountEqual(dirs, [])
self.assertCountEqual(files, ['index.html'])

def test_rclone_sync(self):
    """Sync lifecycle: rclone mirrors additions and deletions into storage."""
    # Work on a throwaway copy of the fixture tree so files can be
    # deleted between syncs without touching the shared fixtures.
    tmp_files_dir = Path(tempfile.mkdtemp()) / "files"
    shutil.copytree(files_dir, tmp_files_dir, symlinks=True)
    storage_dir = "files"

    # Initial sync: every file from the source tree appears in storage.
    tree = [
        ("api", ["index.html"]),
        "api.fjson",
        "conf.py",
        "test.html",
    ]
    with override_settings(DOCROOT=tmp_files_dir):
        self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
    self.assertFileTree(storage_dir, tree)

    # Delete a file locally: sync must remove it from storage too.
    tree = [
        ("api", ["index.html"]),
        "conf.py",
        "test.html",
    ]
    (tmp_files_dir / "api.fjson").unlink()
    with override_settings(DOCROOT=tmp_files_dir):
        self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
    self.assertFileTree(storage_dir, tree)

    # Delete a whole directory locally: sync must prune it from storage.
    tree = [
        "conf.py",
        "test.html",
    ]
    shutil.rmtree(tmp_files_dir / "api")
    with override_settings(DOCROOT=tmp_files_dir):
        self.storage.rclone_sync_directory(tmp_files_dir, storage_dir)
    self.assertFileTree(storage_dir, tree)

def test_rclone_sync_source_symlink(self):
    """A symlinked source directory must be rejected as suspicious."""
    real_dir = Path(tempfile.mkdtemp())
    link_path = Path(tempfile.mkdtemp()) / "files"
    link_path.symlink_to(real_dir)

    # The source resolves through a symbolic link, which the
    # suspicious-path check forbids.
    with override_settings(DOCROOT=real_dir):
        with pytest.raises(SuspiciousFileOperation, match="symbolic link"):
            self.storage.rclone_sync_directory(link_path, "files")

def test_rclone_sync_source_outside_docroot(self):
    """A source directory outside settings.DOCROOT must be rejected."""
    outside_dir = Path(tempfile.mkdtemp())
    docroot = Path(tempfile.mkdtemp()) / "docroot"
    docroot.mkdir()

    # ``outside_dir`` does not live under DOCROOT, so the
    # suspicious-path check must refuse to sync it.
    with override_settings(DOCROOT=docroot):
        with pytest.raises(SuspiciousFileOperation, match="outside the docroot"):
            self.storage.rclone_sync_directory(outside_dir, "files")
192 changes: 192 additions & 0 deletions readthedocs/storage/rclone.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
"""
Wrapper around the rclone command.

See https://rclone.org/docs.
"""

import os
import subprocess

import structlog
from django.utils._os import safe_join as safe_join_fs
from storages.utils import safe_join

log = structlog.get_logger(__name__)


class BaseRClone:

    """
    RClone base class.

    This class allows you to interact with an rclone remote without
    a configuration file; the remote declaration and its options
    are passed in the command itself.

    This base class allows you to use the local file system as remote.

    :param remote_type: You can see the full list of supported providers at
     https://rclone.org/#providers.
    :param rclone_bin: Binary name or path to the rclone binary.
     Defaults to ``rclone``.
    :param default_options: Options passed to the rclone command.
    :param env_vars: Environment variables used when executing the rclone command.
     Useful to pass secrets to the ``rclone`` command, since all arguments and
     options will be logged.
    """

    remote_type = None
    rclone_bin = "rclone"
    default_options = [
        # Number of file transfers to run in parallel.
        # Default value is 4.
        "--transfers=8",
        # Skip based on checksum (if available) & size, not mod-time & size.
        "--checksum",
        "--verbose",
    ]
    env_vars = {}

    def _get_target_path(self, path):
        """
        Get the final target path for the remote.

        .. note::

           This doesn't include the remote type,
           this is just the destination path.
        """
        raise NotImplementedError

    def get_target(self, path):
        """
        Get the proper target using the current remote type.

        We start the remote with `:` to create it on the fly,
        instead of having to create a configuration file.
        See https://rclone.org/docs/#backend-path-to-dir.

        :param path: Path to the remote target.
        """
        path = self._get_target_path(path)
        return f":{self.remote_type}:{path}"

    def execute(self, subcommand, args, options=None):
        """
        Execute an rclone subcommand.

        :param subcommand: Name of the subcommand.
        :param list args: List of positional arguments passed to the command.
        :param list options: List of options passed to the command.
        :raises subprocess.CalledProcessError: If rclone exits non-zero
         (``check=True`` below).
        """
        options = options or []
        # ``--`` terminates option parsing so user-controlled paths in
        # ``args`` can never be interpreted as rclone flags.
        command = [
            self.rclone_bin,
            subcommand,
            *self.default_options,
            *options,
            "--",
            *args,
        ]
        # Start from the current environment (rclone still needs PATH etc.)
        # and layer the remote-specific variables on top. Secrets travel via
        # env vars rather than argv because the command line is logged below.
        env = os.environ.copy()
        env.update(self.env_vars)
        log.info("Executing rclone command.", command=command)
        log.debug("Executing rclone command.", env=env)
        result = subprocess.run(
            command,
            capture_output=True,
            env=env,
            check=True,
        )
        log.debug(
            "rclone execution finished.",
            stdout=result.stdout.decode(),
            stderr=result.stderr.decode(),
            exit_code=result.returncode,
        )
        return result

    def sync(self, source, destination):
        """
        Run the `rclone sync` command.

        See https://rclone.org/commands/rclone_sync/.

        :param source: Local path to the source directory.
        :param destination: Remote path to the destination directory.
        """
        return self.execute("sync", args=[source, self.get_target(destination)])


class RCloneLocal(BaseRClone):

    """
    RClone remote implementation for the local file system.

    Used for local testing only.

    See https://rclone.org/local/.

    :param location: Root directory where the files will be stored.
    """

    remote_type = "local"

    def __init__(self, location):
        # Root directory of the remote; every target path is joined
        # below it in ``_get_target_path``.
        self.location = location

    def _get_target_path(self, path):
        # ``safe_join`` guards against the target escaping ``location``.
        return safe_join_fs(self.location, path)


class RCloneS3Remote(BaseRClone):

    """
    RClone remote implementation for S3.

    All secrets will be passed as environment variables to the rclone command.

    See https://rclone.org/s3/.

    :param bucket_name: Name of the S3 bucket.
    :param access_key_id: AWS access key id.
    :param secret_acces_key: AWS secret access key.
     (NOTE: the parameter name keeps this historical spelling for
     backwards compatibility with keyword callers.)
    :param region: AWS region.
    :param provider: S3 provider, defaults to ``AWS``.
     Useful to use Minio during development.
     See https://rclone.org/s3/#s3-provider.
    :param acl: Canned ACL used when creating buckets and storing or copying objects.
     See https://rclone.org/s3/#s3-acl.
    :param endpoint: Custom S3 endpoint, useful for development.
    """

    remote_type = "s3"

    def __init__(
        self,
        bucket_name,
        access_key_id,
        secret_acces_key,
        region,
        provider="AWS",
        acl=None,
        endpoint=None,
    ):
        # rclone S3 options passed as env vars.
        # https://rclone.org/s3/#standard-options.
        self.env_vars = {
            "RCLONE_S3_PROVIDER": provider,
            "RCLONE_S3_ACCESS_KEY_ID": access_key_id,
            "RCLONE_S3_SECRET_ACCESS_KEY": secret_acces_key,
            "RCLONE_S3_REGION": region,
            "RCLONE_S3_LOCATION_CONSTRAINT": region,
        }
        # Optional settings are only added when given, so rclone falls
        # back to its own defaults otherwise.
        if acl:
            self.env_vars["RCLONE_S3_ACL"] = acl
        if endpoint:
            self.env_vars["RCLONE_S3_ENDPOINT"] = endpoint
        self.bucket_name = bucket_name

    def _get_target_path(self, path):
        """Overridden to prepend the bucket name to the path."""
        return safe_join(self.bucket_name, path)
Loading