Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remote: implement Google Drive #2040

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include dvc/remote/gdrive/google-dvc-client-id.json
11 changes: 11 additions & 0 deletions dvc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ class Config(object): # pylint: disable=too-many-instance-attributes
SECTION_CORE_ANALYTICS_SCHEMA = BOOL_SCHEMA
SECTION_CORE_CHECKSUM_JOBS = "checksum_jobs"
SECTION_CORE_CHECKSUM_JOBS_SCHEMA = And(Use(int), lambda x: x > 0)
SECTION_CORE_OAUTH2_FLOW_RUNNER = "oauth2_flow_runner"
SECTION_CORE_OAUTH2_FLOW_RUNNER_SCHEMA = Choices("console", "local")

SECTION_CACHE = "cache"
SECTION_CACHE_DIR = "dir"
Expand Down Expand Up @@ -195,6 +197,9 @@ class Config(object): # pylint: disable=too-many-instance-attributes
Optional(
SECTION_CORE_CHECKSUM_JOBS, default=None
): SECTION_CORE_CHECKSUM_JOBS_SCHEMA,
Optional(
SECTION_CORE_OAUTH2_FLOW_RUNNER, default="console"
): SECTION_CORE_OAUTH2_FLOW_RUNNER_SCHEMA,
}

# backward compatibility
Expand Down Expand Up @@ -228,6 +233,10 @@ class Config(object): # pylint: disable=too-many-instance-attributes
Optional(SECTION_GCP_PROJECTNAME): str,
}

SECTION_GDRIVE_SCOPES = "scopes"
SECTION_GDRIVE_CREDENTIALPATH = SECTION_AWS_CREDENTIALPATH
SECTION_GDRIVE_OAUTH_ID = "oauth_id"

# backward compatibility
SECTION_LOCAL = "local"
SECTION_LOCAL_STORAGEPATH = SECTION_AWS_STORAGEPATH
Expand Down Expand Up @@ -259,6 +268,8 @@ class Config(object): # pylint: disable=too-many-instance-attributes
Optional(SECTION_AWS_USE_SSL, default=True): BOOL_SCHEMA,
Optional(SECTION_AWS_SSE): str,
Optional(SECTION_GCP_PROJECTNAME): str,
Optional(SECTION_GDRIVE_SCOPES): str,
Optional(SECTION_GDRIVE_OAUTH_ID, default="default"): str,
Optional(SECTION_CACHE_TYPE): SECTION_CACHE_TYPE_SCHEMA,
Optional(SECTION_CACHE_PROTECTED, default=False): BOOL_SCHEMA,
Optional(SECTION_REMOTE_USER): str,
Expand Down
2 changes: 2 additions & 0 deletions dvc/data_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from dvc.remote import Remote
from dvc.remote.s3 import RemoteS3
from dvc.remote.gs import RemoteGS
from dvc.remote.gdrive import RemoteGDrive
from dvc.remote.azure import RemoteAZURE
from dvc.remote.oss import RemoteOSS
from dvc.remote.ssh import RemoteSSH
Expand All @@ -33,6 +34,7 @@ class DataCloud(object):
CLOUD_MAP = {
"aws": RemoteS3,
"gcp": RemoteGS,
"gdrive": RemoteGDrive,
"azure": RemoteAZURE,
"oss": RemoteOSS,
"ssh": RemoteSSH,
Expand Down
2 changes: 2 additions & 0 deletions dvc/remote/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import unicode_literals

from dvc.remote.azure import RemoteAZURE
from dvc.remote.gdrive import RemoteGDrive
from dvc.remote.gs import RemoteGS
from dvc.remote.hdfs import RemoteHDFS
from dvc.remote.local import RemoteLOCAL
Expand All @@ -15,6 +16,7 @@

REMOTES = [
RemoteAZURE,
RemoteGDrive,
RemoteGS,
RemoteHDFS,
RemoteHTTP,
Expand Down
202 changes: 202 additions & 0 deletions dvc/remote/gdrive/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
from __future__ import unicode_literals

import os
import logging

try:
import google_auth_oauthlib
from dvc.remote.gdrive.client import GDriveClient
except ImportError:
google_auth_oauthlib = None

from dvc.scheme import Schemes
from dvc.path_info import CloudURLInfo
from dvc.remote.base import RemoteBASE
from dvc.config import Config
from dvc.remote.gdrive.utils import (
TrackFileReadProgress,
only_once,
metadata_isdir,
shared_token_warning,
)
from dvc.remote.gdrive.exceptions import GDriveError, GDriveResourceNotFound


logger = logging.getLogger(__name__)


class GDriveURLInfo(CloudURLInfo):
    """URL info for ``gdrive://`` URLs.

    For Google Drive the netloc addresses the root folder: the alias
    ``root``, the alias ``appDataFolder``, or a folder ID.
    """

    @property
    def netloc(self):
        # Expose the netloc component of the parsed URL.
        parsed = self.parsed
        return parsed.netloc


class RemoteGDrive(RemoteBASE):
"""Google Drive remote implementation

## Some notes on Google Drive design

Google Drive differs from S3 and GS remotes - it identifies the resources
by IDs instead of paths.

Folders are regular resources with an `application/vnd.google-apps.folder`
MIME type. Resource can have multiple parent folders, and also there could
be multiple resources with the same name linked to a single folder, so
files could be duplicated.

There are multiple root folders accessible from a single user account:
- `root` (special ID) - alias for the "My Drive" folder
- `appDataFolder` (special ID) - alias for the hidden application
space root folder
- shared drives root folders

## Example URLs

- Datasets/my-dataset inside "My Drive" folder:

gdrive://root/Datasets/my-dataset

- Folder by ID (recommended):

gdrive://1r3UbnmS5B4-7YZPZmyqJuCxLVps1mASC

(get it https://drive.google.com/drive/folders/{here})

- Dataset named "my-dataset" in the hidden application folder:

gdrive://appDataFolder/my-dataset

(this one wouldn't be visible through Google Drive web UI and
couldn't be shared)
"""

scheme = Schemes.GDRIVE
path_cls = GDriveURLInfo
REGEX = r"^gdrive://.*$"
REQUIRES = {"google-auth-oauthlib": google_auth_oauthlib}
PARAM_CHECKSUM = "md5Checksum"
SPACE_DRIVE = "drive"
SCOPE_DRIVE = "https://www.googleapis.com/auth/drive"
SPACE_APPDATA = "appDataFolder"
SCOPE_APPDATA = "https://www.googleapis.com/auth/drive.appdata"
DEFAULT_OAUTH_ID = "default"

# Default credential is needed to show the string of "Data Version
# Control" in OAuth dialog application name and icon in authorized
# applications list in Google account security settings. Also, the
# quota usage is limited by the application defined by client_id.
# The good practice would be to suggest the user to create their
# own application credentials.
# TODO(review): document the shared client_id's default quota limits
# and how users could raise them (open reviewer question).

DEFAULT_CREDENTIALPATH = os.path.join(
os.path.dirname(__file__), "google-dvc-client-id.json"
)

def __init__(self, repo, config):
    """Configure the remote from *config* and build the Drive client."""
    super(RemoteGDrive, self).__init__(repo, config)
    self.path_info = self.path_cls(config[Config.SECTION_REMOTE_URL])
    self.root = self.path_info.netloc.lower()

    # The hidden application space has its own root alias and OAuth scope;
    # everything else lives in the regular "drive" space.
    if self.root == self.SPACE_APPDATA.lower():
        space, fallback_scopes = self.SPACE_APPDATA, self.SCOPE_APPDATA
    else:
        space, fallback_scopes = self.SPACE_DRIVE, self.SCOPE_DRIVE

    # Warn users relying on the bundled shared OAuth client credentials.
    if Config.SECTION_GDRIVE_CREDENTIALPATH not in config:
        shared_token_warning()
    cred_path = config.get(
        Config.SECTION_GDRIVE_CREDENTIALPATH, self.DEFAULT_CREDENTIALPATH
    )

    # Scopes are space-delimited strings in the config; the client
    # expects a list, and split() handles the single-scope case too.
    scope_list = config.get(
        Config.SECTION_GDRIVE_SCOPES, fallback_scopes
    ).split()

    core = self.repo.config.config[Config.SECTION_CORE]
    flow_runner = core.get(
        Config.SECTION_CORE_OAUTH2_FLOW_RUNNER, "console"
    )

    self.client = GDriveClient(
        space,
        config.get(Config.SECTION_GDRIVE_OAUTH_ID, self.DEFAULT_OAUTH_ID),
        cred_path,
        scope_list,
        flow_runner,
    )

def get_file_checksum(self, path_info):
    """Return the ``md5Checksum`` Drive reports for *path_info*."""
    fields = ["md5Checksum"]
    meta = self.client.get_metadata(path_info, fields=fields)
    return meta["md5Checksum"]

def exists(self, path_info):
    # Existence check is delegated entirely to the Drive client.
    return self.client.exists(path_info)

def batch_exists(self, path_infos, callback):
    """Check each path in *path_infos* for existence.

    Calls ``callback.update`` once per checked path (progress
    reporting) and returns the per-path results in input order.
    """
    statuses = []
    for info in path_infos:
        statuses.append(self.exists(info))
        callback.update(str(info))
    return statuses

def list_cache_paths(self):
    """Yield remote paths of all entries under the remote's root folder."""
    try:
        root = self.client.get_metadata(self.path_info)
    except GDriveResourceNotFound as exc:
        # A missing root simply means nothing has been cached yet.
        logger.debug("list_cache_paths: {}".format(exc))
        return
    prefix = self.path_info.path
    for child in self.client.list_children(root["id"]):
        yield prefix + "/" + child

@only_once
def mkdir(self, parent, name):
    # Create folder *name* under the folder with ID *parent*.
    # NOTE(review): decorated with @only_once — presumably to avoid
    # creating the same folder twice; confirm against
    # dvc.remote.gdrive.utils.
    return self.client.mkdir(parent, name)

def makedirs(self, path_info):
    """Ensure the folder hierarchy for *path_info* exists on Drive.

    Walks the path components down from the root netloc, reusing
    folders that already exist, then creates the missing tail.

    Args:
        path_info: destination folder path info.

    Returns:
        The Drive ID of the final (deepest) folder.

    Raises:
        GDriveError: if an existing path component is not a folder.
    """
    parent = path_info.netloc
    parts = path_info.path.split("/")
    current_path = ["gdrive://" + path_info.netloc]
    # Components that still need to be created; stays empty when the
    # whole path already exists.
    to_create = []
    for index, part in enumerate(parts):
        try:
            metadata = self.client.get_metadata(
                self.path_cls.from_parts(
                    self.scheme, parent, path="/" + part
                )
            )
        except GDriveResourceNotFound:
            # Everything from this component onwards is missing.
            to_create = parts[index:]
            break
        current_path.append(part)
        if not metadata_isdir(metadata):
            raise GDriveError(
                "{} is not a folder".format("/".join(current_path))
            )
        parent = metadata["id"]
    # Fix: the original re-added the last visited component even when
    # the loop finished without breaking (i.e. the whole path already
    # existed), which would create a duplicate folder — Drive allows
    # multiple same-named children under one parent.
    for part in to_create:
        parent = self.mkdir(parent, part)["id"]
    return parent

def _upload(self, from_file, to_info, name, no_progress_bar):
    """Upload the local file *from_file* to *to_info* on Drive."""
    # Resolve the destination parent: an existing folder's metadata,
    # a freshly created hierarchy, or the bare netloc when the
    # destination sits directly under the root.
    if to_info.parent.path:
        try:
            parent = self.client.get_metadata(to_info.parent)
        except GDriveResourceNotFound:
            parent = self.makedirs(to_info.parent)
    else:
        parent = to_info.netloc

    stream = open(from_file, "rb")
    if not no_progress_bar:
        # Wrap the stream so reads drive the progress bar.
        stream = TrackFileReadProgress(name, stream)

    try:
        self.client.upload(parent, to_info, stream)
    finally:
        stream.close()

def _download(self, from_info, to_file, name, no_progress_bar):
    # Download is delegated entirely to the Drive client.
    self.client.download(from_info, to_file, name, no_progress_bar)
Loading