From fb4f1f55caf223685e55dc39e07c8bcf277a3817 Mon Sep 17 00:00:00 2001 From: Gianmaria Del Monte <39946305+gmgigi96@users.noreply.github.com> Date: Tue, 14 Sep 2021 07:34:08 +0200 Subject: [PATCH] Implement folder download as archive (#2066) --- changelog/unreleased/archiver-service.md | 7 + internal/http/services/archiver/archiver.go | 348 ++++++++++++++++++++ internal/http/services/loader/loader.go | 1 + pkg/storage/utils/walker/walker.go | 119 +++++++ 4 files changed, 475 insertions(+) create mode 100644 changelog/unreleased/archiver-service.md create mode 100644 internal/http/services/archiver/archiver.go create mode 100644 pkg/storage/utils/walker/walker.go diff --git a/changelog/unreleased/archiver-service.md b/changelog/unreleased/archiver-service.md new file mode 100644 index 00000000000..d65ab1c91c2 --- /dev/null +++ b/changelog/unreleased/archiver-service.md @@ -0,0 +1,7 @@ +Enhancement: Implement folder download as archive + +Adds a new http service which will create an archive +(platform dependent, zip in windows and tar in linux) given a list of file. + +https://github.com/cs3org/reva/issues/1698 +https://github.com/cs3org/reva/pull/2066 diff --git a/internal/http/services/archiver/archiver.go b/internal/http/services/archiver/archiver.go new file mode 100644 index 00000000000..8cfeab83d75 --- /dev/null +++ b/internal/http/services/archiver/archiver.go @@ -0,0 +1,348 @@ +// Copyright 2018-2021 CERN +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// In applying this license, CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +package archiver + +import ( + "archive/tar" + "archive/zip" + "context" + "fmt" + "io" + "net/http" + "path" + "strings" + "time" + + gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1" + rpc "github.com/cs3org/go-cs3apis/cs3/rpc/v1beta1" + provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1" + "github.com/cs3org/reva/internal/http/services/datagateway" + "github.com/cs3org/reva/pkg/errtypes" + "github.com/cs3org/reva/pkg/rgrpc/todo/pool" + "github.com/cs3org/reva/pkg/rhttp" + "github.com/cs3org/reva/pkg/rhttp/global" + "github.com/cs3org/reva/pkg/sharedconf" + "github.com/cs3org/reva/pkg/storage/utils/walker" + "github.com/gdexlab/go-render/render" + ua "github.com/mileusna/useragent" + "github.com/mitchellh/mapstructure" + "github.com/rs/zerolog" +) + +type svc struct { + config *Config + httpClient *http.Client + gtwClient gateway.GatewayAPIClient + log *zerolog.Logger +} + +// Config holds the config options that need to be passed down to all ocdav handlers +type Config struct { + Prefix string `mapstructure:"prefix"` + GatewaySvc string `mapstructure:"gatewaysvc"` + Timeout int64 `mapstructure:"timeout"` + Insecure bool `mapstructure:"insecure"` + MaxNumFiles int64 `mapstructure:"max_num_files"` + MaxSize int64 `mapstructure:"max_size"` +} + +var ( + errMaxFileCount = errtypes.InternalError("reached max files count") + errMaxSize = errtypes.InternalError("reached max total files size") +) + +func init() { + global.Register("archiver", New) +} + +// New creates a new archiver service +func New(conf map[string]interface{}, log *zerolog.Logger) (global.Service, error) { + c := &Config{} + err := mapstructure.Decode(conf, c) + if err != nil { + return nil, err + } + + c.init() + + gtw, err := pool.GetGatewayServiceClient(c.GatewaySvc) + if err != nil { + return nil, err + } + + return &svc{ + config: c, + gtwClient: gtw, + httpClient: rhttp.GetHTTPClient( + rhttp.Timeout(time.Duration(c.Timeout*int64(time.Second))), + rhttp.Insecure(c.Insecure), + ), + log: log, + }, nil +} + +func (c *Config) init() { + if c.Prefix == "" { + c.Prefix = "download_archive" + } + + c.GatewaySvc = sharedconf.GetGatewaySVC(c.GatewaySvc) +} + +func (s *svc) Handler() http.Handler { + return http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + // get the dir and files to archive from the URL + ctx := r.Context() + v := r.URL.Query() + if _, ok := v["dir"]; !ok { + rw.WriteHeader(http.StatusBadRequest) + return + } + dir := v["dir"][0] + + names, ok := v["file"] + if !ok { + names = []string{} + } + + // append to the files name the dir + files := []string{} + for _, f := range names { + p := path.Join(dir, f) + files = append(files, strings.TrimSuffix(p, "/")) + } + + userAgent := ua.Parse(r.Header.Get("User-Agent")) + + archiveName := "download" + if len(files) == 0 { + // we need to archive the whole dir + files = append(files, dir) + archiveName = path.Base(dir) + } + + if userAgent.OS == ua.Windows { + archiveName += ".zip" + } else { + archiveName += ".tar" + } + + s.log.Debug().Msg("Requested the following files/folders to archive: " + render.Render(files)) + + rw.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", archiveName)) + rw.Header().Set("Content-Transfer-Encoding", "binary") + + var err error + if userAgent.OS == ua.Windows { + err = s.createZip(ctx, dir, files, rw) + } else { + err = s.createTar(ctx, dir, files, rw) + } + if err == errMaxFileCount || err == errMaxSize { + s.log.Error().Msg(err.Error()) + rw.WriteHeader(http.StatusRequestEntityTooLarge) + return + } + if err != nil { + s.log.Error().Msg(err.Error()) + rw.WriteHeader(http.StatusInternalServerError) + return + } + + }) +} + +func (s *svc) Prefix() string { + return s.config.Prefix +} + +func (s *svc) Close() error { + return nil +} + +func (s *svc) Unprotected() []string { + return nil +} + +// create a new tar containing the files in the `files` list, which are in the directory `dir` +func (s *svc) createTar(ctx context.Context, dir string, files []string, dst io.Writer) error { + w := tar.NewWriter(dst) + + var filesCount, sizeFiles int64 + + for _, root := range files { + + err := walker.Walk(ctx, root, s.gtwClient, func(path string, info *provider.ResourceInfo, err error) error { + if err != nil { + return err + } + + filesCount++ + if filesCount > s.config.MaxNumFiles { + return errMaxFileCount + } + sizeFiles += int64(info.Size) + if sizeFiles > s.config.MaxSize { + return errMaxSize + } + + fileName := strings.TrimPrefix(path, dir) + + header := tar.Header{ + Name: fileName, + ModTime: time.Unix(int64(info.Mtime.Seconds), 0), + } + + isDir := info.Type == provider.ResourceType_RESOURCE_TYPE_CONTAINER + + if isDir { + // the resource is a folder + header.Mode = 0755 + header.Typeflag = tar.TypeDir + } else { + header.Mode = 0644 + header.Typeflag = tar.TypeReg + header.Size = int64(info.Size) + } + + err = w.WriteHeader(&header) + if err != nil { + return err + } + + if !isDir { + err = s.downloadFile(ctx, path, w) + if err != nil { + return err + } + } + return nil + }) + + if err != nil { + return err + } + + } + return w.Close() +} + +// create a new zip containing the files in the `files` list, which are in the directory `dir` +func (s *svc) createZip(ctx context.Context, dir string, files []string, dst io.Writer) error { + w := zip.NewWriter(dst) + + var filesCount, sizeFiles int64 + + for _, root := range files { + + err := walker.Walk(ctx, root, s.gtwClient, func(path string, info *provider.ResourceInfo, err error) error { + if err != nil { + return err + } + + filesCount++ + if filesCount > s.config.MaxNumFiles { + return errMaxFileCount + } + sizeFiles += int64(info.Size) + if sizeFiles > s.config.MaxSize { + return errMaxSize + } + + fileName := strings.TrimPrefix(strings.Trim(path, dir), "/") + + if fileName == "" { + return nil + } + + header := zip.FileHeader{ + Name: fileName, + Modified: time.Unix(int64(info.Mtime.Seconds), 0), + } + + isDir := info.Type == provider.ResourceType_RESOURCE_TYPE_CONTAINER + + if isDir { + header.Name += "/" + } else { + header.UncompressedSize64 = info.Size + } + + dst, err := w.CreateHeader(&header) + if err != nil { + return err + } + + if !isDir { + err = s.downloadFile(ctx, path, dst) + if err != nil { + return err + } + } + return nil + }) + + if err != nil { + return err + } + + } + return w.Close() +} + +func (s *svc) downloadFile(ctx context.Context, path string, dst io.Writer) error { + downResp, err := s.gtwClient.InitiateFileDownload(ctx, &provider.InitiateFileDownloadRequest{ + Ref: &provider.Reference{ + Path: path, + }, + }) + + switch { + case err != nil: + return err + case downResp.Status.Code != rpc.Code_CODE_OK: + return errtypes.InternalError(downResp.Status.Message) + } + + var endpoint, token string + for _, p := range downResp.Protocols { + if p.Protocol == "simple" { + endpoint, token = p.DownloadEndpoint, p.Token + } + } + + httpReq, err := rhttp.NewRequest(ctx, http.MethodGet, endpoint, nil) + if err != nil { + return err + } + httpReq.Header.Set(datagateway.TokenTransportHeader, token) + + httpRes, err := s.httpClient.Do(httpReq) + if err != nil { + return err + } + defer httpRes.Body.Close() + + if httpRes.StatusCode != http.StatusOK { + return errtypes.InternalError(httpRes.Status) + } + + _, err = io.Copy(dst, httpRes.Body) + return err +} diff --git a/internal/http/services/loader/loader.go b/internal/http/services/loader/loader.go index 73ac5e27197..07edbc6aeca 100644 --- a/internal/http/services/loader/loader.go +++ b/internal/http/services/loader/loader.go @@ -21,6 +21,7 @@ package loader import ( // Load core HTTP services _ "github.com/cs3org/reva/internal/http/services/appprovider" + _ "github.com/cs3org/reva/internal/http/services/archiver" _ "github.com/cs3org/reva/internal/http/services/datagateway" _ "github.com/cs3org/reva/internal/http/services/dataprovider" _ "github.com/cs3org/reva/internal/http/services/helloworld" diff --git a/pkg/storage/utils/walker/walker.go b/pkg/storage/utils/walker/walker.go new file mode 100644 index 00000000000..7a4d3b2d1d6 --- /dev/null +++ b/pkg/storage/utils/walker/walker.go @@ -0,0 +1,119 @@ +// Copyright 2018-2021 CERN +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// In applying this license, CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +package walker + +import ( + "context" + "fmt" + "path/filepath" + + gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1" + rpc "github.com/cs3org/go-cs3apis/cs3/rpc/v1beta1" + provider "github.com/cs3org/go-cs3apis/cs3/storage/provider/v1beta1" + + "github.com/cs3org/reva/pkg/errtypes" +) + +// WalkFunc is the type of function called by Walk to visit each file or directory +// +// Each time the Walk function meet a file/folder path is set to the full path of this. +// The err argument reports an error related to the path, and the function can decide the action to +// do with this. +// +// The error result returned by the function controls how Walk continues. If the function returns the special value SkipDir, Walk skips the current directory. +// Otherwise, if the function returns a non-nil error, Walk stops entirely and returns that error. +type WalkFunc func(path string, info *provider.ResourceInfo, err error) error + +// Walk walks the file tree rooted at root, calling fn for each file or folder in the tree, including the root. +func Walk(ctx context.Context, root string, gtw gateway.GatewayAPIClient, fn WalkFunc) error { + info, err := stat(ctx, root, gtw) + + if err != nil { + return fn(root, nil, err) + } + + err = walkRecursively(ctx, root, info, gtw, fn) + + if err == filepath.SkipDir { + return nil + } + + return err +} + +func walkRecursively(ctx context.Context, path string, info *provider.ResourceInfo, gtw gateway.GatewayAPIClient, fn WalkFunc) error { + + if info.Type != provider.ResourceType_RESOURCE_TYPE_CONTAINER { + return fn(path, info, nil) + } + + list, err := readDir(ctx, path, gtw) + errFn := fn(path, info, err) + + if err != nil || errFn != nil { + return errFn + } + + for _, file := range list { + err = walkRecursively(ctx, file.Path, file, gtw, fn) + if err != nil && (file.Type != provider.ResourceType_RESOURCE_TYPE_CONTAINER || err != filepath.SkipDir) { + return err + } + } + + return nil +} + +func readDir(ctx context.Context, path string, gtw gateway.GatewayAPIClient) ([]*provider.ResourceInfo, error) { + resp, err := gtw.ListContainer(ctx, &provider.ListContainerRequest{ + Ref: &provider.Reference{ + Path: path, + }, + }) + + switch { + case err != nil: + return nil, err + case resp.Status.Code == rpc.Code_CODE_NOT_FOUND: + return nil, errtypes.NotFound(path) + case resp.Status.Code != rpc.Code_CODE_OK: + return nil, errtypes.InternalError(fmt.Sprintf("error reading dir %s", path)) + } + + return resp.Infos, nil +} + +func stat(ctx context.Context, path string, gtw gateway.GatewayAPIClient) (*provider.ResourceInfo, error) { + resp, err := gtw.Stat(ctx, &provider.StatRequest{ + Ref: &provider.Reference{ + Path: path, + }, + }) + + switch { + case err != nil: + return nil, err + case resp.Status.Code == rpc.Code_CODE_NOT_FOUND: + return nil, errtypes.NotFound(path) + case resp.Status.Code != rpc.Code_CODE_OK: + return nil, errtypes.InternalError(fmt.Sprintf("error getting stats from %s", path)) + } + + return resp.Info, nil +}