From 033063abc99b7b11536330e872778440df3af1ac Mon Sep 17 00:00:00 2001 From: Pavel Zolotarevskiy Date: Mon, 11 Dec 2023 11:48:31 +0000 Subject: [PATCH] Add back Google Sheets support and remove outdated code (#23) * refactor: remove outdated functions * feat: add back google sheets support * pkg: remove unneeded dependencies * fix: fix the drive export url * test: update the tests to match new functions * docs: add a docstring for sheet_download_url * docs: update the documentation to the unified download function * docs: remove outdated information in the readme * docs: fix typos in the documentation * pkg: bump version to 0.2.0 * refactor: export a google_download_url function * fix: accept file paths and commands as download locations * docs: remove outdated information from the readme * test: expand test coverage * pkg: add compat entry for Downloads --- Project.toml | 10 +-- README.md | 73 +---------------- docs/src/features.md | 52 +++--------- docs/src/index.md | 3 +- src/GoogleDrive.jl | 189 ++++++++++--------------------------------- test/runtests.jl | 13 ++- 6 files changed, 70 insertions(+), 270 deletions(-) diff --git a/Project.toml b/Project.toml index 1ac4c43..cd0ef35 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,11 @@ name = "GoogleDrive" uuid = "91feb7a0-3508-11ea-1e8e-afea2c1c9a19" authors = ["tejasvaidhyadev and contributors"] -version = "0.1.3" +version = "0.2.0" [deps] -DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe" -Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Downloads = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" -Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [compat] -DataDeps = "0.7.10" -Downloads = "1" -HTTP = "1.4.0" julia = "1.9" +Downloads = "1" diff --git a/README.md b/README.md index ce24a8b..49e43ef 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,7 @@ https://github.com/JuliaIO/GoogleDrive.jl ## Introduction -GoogleDrive.jl provides support for downloading files from Google Drive, -directly as an IO stream, -or on top of -[DataDeps.jl](https://github.com/oxinabox/DataDeps.jl). - +GoogleDrive.jl provides support for downloading files from Google Drive and Google Sheets. ### Installation Install the package using the @@ -32,9 +28,6 @@ pkg> add GoogleDrive ## Details -Some of the methods in this package -may no longer be needed -because of changes to Google Drive API. To download data into an IO stream from a special URL of the form `url = "https://docs.google.com/uc?export=download&id=1GqmszfSB_LHGQEQpSjoiPyDROZ5a8Ls4"`, @@ -59,63 +52,6 @@ google_download(url, io) str = String(take!(io)) # (this line for text data only) ``` - -### drive_download - `drive_download(URL, localdir)` - -Download file from Google drive. -The above function only Download file from google drive. - - -### sheet_handler - `sheet_handler(long_url)` - -Provide URL that can be use as link for registering in Datadeps - - -### google_download - `google_download(URL, localdir)` - -It can also be consider as "maybe google download function" -The function can be used to download a file from -google-drive, google-sheets or HTTP download method. - - -## Configuration -This package is build on top of -[DataDeps.jl](https://github.com/oxinabox/DataDeps.jl). -To configure, e.g., where downloaded files save to, and read from -(and to understand how that works), -see the DataDeps.jl readme. - - -## Examples - -Load the package with - -``` -julia> using GoogleDrive -``` - -### Loading different Embeddings - -Downloading CSV file from GoogleDrive using `google_download` function -``` -julia> google_download("https://docs.google.com/spreadsheets/d/1tbNIGxnp8wLk31DIMNPD_Hi_CmIdpdTPfzMUDcs1xE/edit#gid=0", "/home/iamtejas/Downloads") -┌ Info: Downloading -│ source = "https://docs.google.com/spreadsheets/d/1tbNI-Gxnp8wLk31DIMNPD_Hi_CmIdpdTPfzMUDcs1xE/export?format=csv" -│ dest = "/home/iamtejas/Downloads/InternshipsToApplyFor-Sheet1.csv" -│ progress = NaN -│ time_taken = "0.05 s" -│ time_remaining = "NaN s" -│ average_speed = "84.961 KiB/s" -│ downloaded = "4.163 KiB" -│ remaining = "∞ B" -└ total = "∞ B" -"/home/iamtejas/Downloads/InternshipsToApplyFor-Sheet1.csv" - -``` - ## Contributing and Reporting Bugs Contributions, in the form of bug-reports, pull requests, additional documentation are encouraged. They can be made to the Github repository. @@ -123,13 +59,6 @@ Contributions, in the form of bug-reports, pull requests, additional documentati **All contributions and communications should abide by the [Julia Community Standards](https://julialang.org/community/standards/).** - -### Reference - -Code:- -[Peter Cheng](https://github.com/chengchingwen/Transformers.jl), [HTTPS](https://github.com/JuliaWeb/HTTP.jl) - - [action-img]: https://github.com/JuliaIO/GoogleDrive.jl/workflows/CI/badge.svg [action-url]: https://github.com/JuliaIO/GoogleDrive.jl/actions diff --git a/docs/src/features.md b/docs/src/features.md index 2ed2f94..db1622e 100644 --- a/docs/src/features.md +++ b/docs/src/features.md @@ -1,52 +1,26 @@ ## functions -### drive_download - `drive_download(URL, localdir)` +### google_download + +`google_download(URL, IO)` +Download a file from Google Drive or Google Sheets. -Download flie from Google drive. -The above function only Download file from google drive. -Downloading CSV file from google_drive using google_download function #### Example -``` -julia>drive_download("https://drive.google.com/file/d/0B9w48e1rj-MOLVdZRzFfTlNsem8/view") -┌ Info: Downloading -│ source = "https://drive.google.com/file/d/0B9w48e1rj-MOLVdZRzFfTlNsem8/view" -│ dest = "/home/iamtejas/Downloads/InternshipsToApplyFor-Sheet1.csv" -│ progress = NaN -│ time_taken = "0.05 s" -│ time_remaining = "NaN s" -│ average_speed = "84.961 KiB/s" -│ downloaded = "4.163 KiB" -│ remaining = "∞ B" -└ total = "∞ B" -"/home/iamtejas/Downloads/InternshipsToApplyFor-Sheet1.csv" +Downloading a ZIP file from Drive using the `google_download` function +``` +julia> google_download("https://drive.google.com/file/d/0B9w48e1rj-MOLVdZRzFfTlNsem8/view", "file.zip") +"/home/iamtejas/Downloads/file.zip" ``` -### sheet_handler - `sheet_handler(long_url)` - -Provide URL that can be use as link for registering in Datadeps +### google_download_url -### google_download - `google_download(URL, localdir)` +Convert a direct Google Drive / Google Sheets URL to the direct download form. -It can also be consider as "maybe google download function" -The function can be used to download file from google-drive,google-sheets or HTTP download method #### Example -``` -julia>google_download("https://docs.google.com/spreadsheets/d/1tbNIGxnp8wLk31DIMNPD_Hi_CmIdpdTPfzMUDcs1xE/edit#gid=0","/home/iamtejas/Downloads") -┌ Info: Downloading -│ source = "https://docs.google.com/spreadsheets/d/1tbNI-Gxnp8wLk31DIMNPD_Hi_CmIdpdTPfzMUDcs1xE/export?format=csv" -│ dest = "/home/iamtejas/Downloads/InternshipsToApplyFor-Sheet1.csv" -│ progress = NaN -│ time_taken = "0.05 s" -│ time_remaining = "NaN s" -│ average_speed = "84.961 KiB/s" -│ downloaded = "4.163 KiB" -│ remaining = "∞ B" -└ total = "∞ B" -"/home/iamtejas/Downloads/InternshipsToApplyFor-Sheet1.csv" ``` +julia> google_download_url("https://drive.google.com/file/d/0B9w48e1rj-MOLVdZRzFfTlNsem8/view") +"https://docs.google.com/uc?export=download&id=0B9w48e1rj-MOLVdZRzFfTlNsem8" +``` diff --git a/docs/src/index.md b/docs/src/index.md index 180f465..f117543 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -6,8 +6,7 @@ ``` ## Preface -This manual is designed to get you started using GoogleDrive.jl Package in julia in Julia. -It assumes that you already familiar with the basic of [datadeps](https://white.ucc.asn.au/DataDeps.jl/stable/index.html). +This manual is designed to get you started using GoogleDrive.jl Package in Julia. ## Installation diff --git a/src/GoogleDrive.jl b/src/GoogleDrive.jl index ecba3eb..8b9f154 100644 --- a/src/GoogleDrive.jl +++ b/src/GoogleDrive.jl @@ -1,169 +1,68 @@ module GoogleDrive -using DataDeps -using Dates: now, Millisecond +using Base: AbstractCmd using Downloads: download -using Random: randstring -using HTTP -export drive_download -export sheet_handler -export google_download - - -" - google_download_url(url::AbstractString)::String -Convert a GoogleDrive URL of the form -`https://drive.google.com/file/d/XYZ` -to the form needed for raw data download: -`https://docs.google.com/uc?export=download&id=XYZ` -" -function google_download_url(url::AbstractString) - old = "https://drive.google.com/file/d/" - new = "https://docs.google.com/uc?export=download&id=" - startswith(url, old) || startswith(url, new) || - throw(ArgumentError("Unknown URL form $url")) - return replace(url, old => new) -end +export google_download, google_download_url """ - google_download(url::AbstractString, io::IO) -Download data from Google URL `url` into `io`, returning `io`. + google_download(url::AbstractString, location::Union{AbstractString,AbstractCmd,IO}) +Download data from Google URL `url` into `location`, returning `location`. -This mutates `io` so arguably it should be named `google_download!`, -but `Downloads.download` also mutates an `IO` argument -so we follow its convention. +`location` can be of any type that `Downloads.download` accepts - so a file +path, a command, or an IO buffer. """ -function google_download(url::AbstractString, io::IO) +function google_download(url::AbstractString, location::Union{AbstractString,AbstractCmd,IO}) url = google_download_url(url) - return download(url, io) + return download(url, location) end - """ - unshortlink(url) -return unshorten url or the url if it is not a short link + google_download_url(url::AbstractString) +Convert a direct Google Drive/Sheets URL to a direct download +URL. The resulting URL can be used with `Downloads`, `DataDeps` +or any other data fetching package. """ -function unshortlink(url; kw...) - rq = HTTP.request("HEAD", url; redirect=false, status_exception=false, kw...) - while rq.status ÷ 100 == 3 - url = HTTP.header(rq, "Location") - rq = HTTP.request("HEAD", url; redirect=false, status_exception=false, kw...) - end - url -end - - -isg_sheet(url) = occursin("docs.google.com/spreadsheets", url) -isg_drive(url) = occursin("drive.google.com", url) - -function sheet_handler(url; format=:csv) - link, expo = splitdir(url) - if startswith(expo, "edit") || expo == "" - url = link * "/export?format=$format" - elseif startswith(expo, "export") - url = replace(url, r"format=([a-zA-Z]*)(.*)"=>SubstitutionString("format=$format\\2")) - end - url -end - -function google_download(url, localdir) - long_url = unshortlink(url) - if isg_sheet(long_url) - - long_url = sheet_handler(long_url) - end - - if isg_drive(long_url) - drive_download(long_url, localdir) +function google_download_url(url::AbstractString) + if is_drive_url(url) + url = drive_download_url(url) + elseif is_sheet_url(url) + url = sheet_download_url(url) else - DataDeps.fetch_http(long_url, localdir) - end -end - -function find_gcode(ckj) - for cookie ∈ ckj - if match(r"_warning_", cookie.name) !== nothing - return cookie.value - end + throw(ArgumentError("Unknown URL form $url")) end - - nothing end +is_sheet_url(url) = occursin("docs.google.com/spreadsheets", url) -function drive_download(url, localdir) - default_ckjar = HTTP.CookieRequest.default_cookiejar - # On newer version of HTTP.jl default_ckjar is an Array with one per thread - ckjar = copy(default_ckjar isa Array ? default_ckjar[Base.Threads.threadid()] : default_ckjar) - rq = HTTP.request("HEAD", url; cookies=true, cookiejar=ckjar) - ckj = ckjar["drive.google.com"] - gcode = find_gcode(ckj) - @assert gcode !== nothing - - format_progress(x) = round(x, digits=4) - format_bytes(x) = !isfinite(x) ? "∞ B" : Base.format_bytes(x) - format_seconds(x) = "$(round(x; digits=2)) s" - format_bytes_per_second(x) = format_bytes(x) * "/s" - - local filepath - newurl = unshortlink("$url&confirm=$gcode"; cookies=true, cookiejar=ckjar) - - - HTTP.open("GET", newurl, ["Range"=>"bytes=0-"]; cookies=true, cookiejar=ckjar) do stream - resp = HTTP.startread(stream) - hcd = HTTP.header(resp, "Content-Disposition") - m = match(r"filename=\\\"(.*)\\\"", hcd) - if m === nothing - filename = "drive_download-$(randstring())" - else - filename = m.captures[] - end - - filepath = joinpath(localdir, filename) - - total_bytes = tryparse(Float64, split(HTTP.header(resp, "Content-Range"), '/')[end]) - total_bytes === nothing && (total_bytes = NaN) - downloaded_bytes = 0 - start_time = now() - prev_time = now() - period = DataDeps.progress_update_period() - - function report_callback() - prev_time = now() - taken_time = (prev_time - start_time).value / 1000 # in seconds - average_speed = downloaded_bytes / taken_time - remaining_bytes = total_bytes - downloaded_bytes - remaining_time = remaining_bytes / average_speed - completion_progress = downloaded_bytes / total_bytes - - @info("Downloading", - source=url, - dest = filepath, - progress = completion_progress |> format_progress, - time_taken = taken_time |> format_seconds, - time_remaining = remaining_time |> format_seconds, - average_speed = average_speed |> format_bytes_per_second, - downloaded = downloaded_bytes |> format_bytes, - remaining = remaining_bytes |> format_bytes, - total = total_bytes |> format_bytes, - ) - end +const _drive_pattern = r"^https?:\/\/drive\.google\.com\/file\/d\/([^\/]*).*" +is_drive_url(url) = occursin(_drive_pattern, url) || occursin("docs.google.com/uc", url) +""" + drive_download_url(url::AbstractString)::String +Convert a GoogleDrive URL of the form +`https://drive.google.com/file/d/XYZ` +to the form needed for raw data download: +`https://docs.google.com/uc?export=download&id=XYZ` +""" +function drive_download_url(url::AbstractString) + full_url = s"https://docs.google.com/uc?export=download&id=\1" + return replace(url, _drive_pattern => full_url) +end - Base.open(filepath, "w") do fh - while(!eof(stream)) - downloaded_bytes += write(fh, readavailable(stream)) - if !isinf(period) - if now() - prev_time > Millisecond(1000*period) - report_callback() - end - end - end - end - report_callback() +""" + sheet_download_url(url::AbstractString, format)::String +Convert a Google Sheets URL of the form +`https://docs.google.com/spreadsheets/d/XYZ/edit` +to the form needed for raw data download: +`https://docs.google.com/spreadsheets/d/XYZ/export?format=FORMAT` +""" +function sheet_download_url(url::AbstractString, format="csv") + link, action = splitdir(url) + if !startswith(action, "export") + url = link * "/export?format=$format" end - filepath + return url end end # Module diff --git a/test/runtests.jl b/test/runtests.jl index f2abe5f..8ddb223 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,13 +1,18 @@ -using GoogleDrive: google_download, google_download_url -#using Downloads: download +using GoogleDrive using Test @testset "GoogleDrive.jl" begin - src = "https://drive.google.com/file/d/XYZ" dst = "https://docs.google.com/uc?export=download&id=XYZ" - @test google_download_url(src) == dst + @test google_download_url("https://drive.google.com/file/d/XYZ") == dst + @test google_download_url("https://drive.google.com/file/d/XYZ/view") == dst @test google_download_url(dst) == dst + + src = "https://docs.google.com/spreadsheets/d/XYZ/edit" + dst = "https://docs.google.com/spreadsheets/d/XYZ/export?format=" + @test google_download_url(src) == dst * "csv" + @test GoogleDrive.sheet_download_url(src, "xlsx") == dst * "xlsx" + @test_throws ArgumentError google_download_url("foo") #=