From e498352734b2f7b7673d777032b36a309a244a83 Mon Sep 17 00:00:00 2001 From: Chris Tomkins-Tinch Date: Wed, 6 Nov 2024 17:03:28 -0500 Subject: [PATCH] add task: download_from_url() to download a URL to a file (#562) * add task: download_from_web() to download a URL to a file Download a URL to a file. This task exists as a workaround until Terra supports this functionality natively; cromwell already supports this: https://cromwell.readthedocs.io/en/stable/filesystems/HTTP/ * download_from_web task: allow user to specify the output filename * add md5-based file integrity checking to download_from_url, and option to save http response headers add file integrity checking to download_from_url by comparing against an md5 checksum provided as a string or via an additional URL, as well as an option to save http response headers * add read failure timeout to wget call in download_from_url add read failure timeout to wget call in download_from_url; this causes wget to retry in the event a download hangs. This has been found to resolve an issue with some ftp downloads stalling at 100% without finalizing. * additional input param documentation for download_from_url --- pipes/WDL/tasks/tasks_utils.wdl | 173 ++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index a527d5607..ac9d3553a 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -176,6 +176,179 @@ task tar_extract { } } +task download_from_url { + meta { + description: "Download a file from a URL. This task exists as a workaround until Terra supports this functionality natively (cromwell already does: https://cromwell.readthedocs.io/en/stable/filesystems/HTTP/). http[s] and ftp supported" + volatile: true + } + input { + String url_to_download + + String? output_filename + String? additional_wget_opts + String request_method="GET" + Int request_max_retries=1 + + String? md5_hash_expected + String? 
md5_hash_expected_file_url + Boolean save_response_header_to_file = false + + Int disk_size = 50 + } + + parameter_meta { + url_to_download: { + description: "The URL to download; this is passed to wget" + } + + output_filename: { + description: "The filename to use for the downloaded file. This is optional, though it can be helpful in the event the server does not advise on a filename via the 'Content-Disposition' header." + } + additional_wget_opts: { + description: "Additional options passed to wget as part of the download command." + } + request_method: { + description: "The request method ('GET', 'POST', etc.) passed to wget. Optional (default: 'GET')" + } + request_max_retries: { + description: "The maximum number of (additional) re-tries to attempt in the event of failed download." + } + md5_hash_expected: { + description: "The (binary-mode) md5 hash expected for the file to download. If provided and the value does not match the md5 hash of the downloaded file, the task will fail. mutually exclusive with md5_hash_expected_file_url" + } + md5_hash_expected_file_url: { + description: "The url of a file containing the (binary-mode) md5 hash expected for the file to download. If provided and the value does not match the md5 hash of the downloaded file, the task will fail. mutually exclusive with md5_hash_expected" + } + save_response_header_to_file: { + description: "If save_response_header_to_file=true, http response headers will be saved to an output file. Only applicable for http[s] URLs." 
+ } + } + + String download_subdir_local = "downloaded" + command <<< + # enforce that only one source of expected md5 hash can be provided + ~{if defined(md5_hash_expected) && defined(md5_hash_expected_file_url) then 'echo "The inputs \'md5_hash_expected\' and \'md5_hash_expected_file_url\' cannot both be specified; please provide only one."; exit 1;' else ''} + + mkdir -p "~{download_subdir_local}/tmp" + + pushd "~{download_subdir_local}" + + # ---- download desired file + pushd "tmp" + + # if a URL-encoded version of the requested download is needed + #encoded_url=$(python3 -c "import urllib.parse; print urllib.parse.quote('''~{url_to_download}''')") + + # get the desired file using wget + # --content-disposition = use the file name suggested by the server via the Content-Disposition header + # --trust-server-names = ...and in the event of a redirect, use the value of the final page rather than that of the original url + # --save-headers = save the headers sent by the HTTP server to the file, preceding the actual contents, with an empty line as the separator. + wget \ + --read-timeout 3 --waitretry 30 \ + --no-verbose \ + --method ~{request_method} \ + ~{if defined(output_filename) then "--output-document ~{output_filename}" else ""} \ + --tries ~{request_max_retries} \ + --content-disposition --trust-server-names ~{additional_wget_opts} \ + '~{url_to_download}' \ + ~{if save_response_header_to_file then "--save-headers" else ""} || (echo "ERROR: request to ~{request_method} file from URL failed: ~{url_to_download}"; exit 1) + + # ---- + + # get the name of the downloaded file + downloaded_file_name="$(basename $(ls -1 | head -n1))" + + if [ ! -f "$downloaded_file_name" ]; then + echo "Could not locate downloaded file \"$downloaded_file_name\"" + exit 1 + fi + + if [ ! 
-s "$downloaded_file_name" ]; then + echo "Downloaded file appears empty: \"$downloaded_file_name\"" + exit 1 + fi + + popd # return to downloaded/ + + # (only for http(s)) split http response headers from response body + # since wget stores both in a single file separated by a couple newlines + if [[ "~{url_to_download}" =~ ^https?:// ]] && ~{if save_response_header_to_file then "true" else "false"}; then + echo "Saving response headers separately..." + csplit -f response -s tmp/${downloaded_file_name} $'/^\r$/+1' && \ + mv response00 ../${downloaded_file_name}.headers && \ + mv response01 ${downloaded_file_name} && \ + rm "tmp/$downloaded_file_name" + else + mv tmp/${downloaded_file_name} ${downloaded_file_name} + fi + # alternative python implementation to split response headers from body + # via https://stackoverflow.com/a/75483099 + #python3 << CODE + #if ~{if save_response_header_to_file then "True" else "False"}: + # with open("tmp/${downloaded_file_name}", "rb") as f_downloaded: + # headers, body = f_downloaded.read().split(b"\r\n\r\n", 1) + # # write the response header to a file + # with open("${downloaded_file_name}.headers", "wb") as f_headers: + # f_headers.write(headers) + # f_headers.write(b"\r\n") + # # save the file body to its final location + # with open("${downloaded_file_name}", "wb") as f: + # f.write(body) + #else: + # ## if headers are not being saved, move the file to its final destination + # import shutil + # shutil.move("tmp/${downloaded_file_name}","${downloaded_file_name}") + #CODE + + rm -r "tmp" + + popd # return to job working directory + + check_md5_sum() { + # $1 = md5sum expected + # $2 = md5sum of downloaded file + if [[ "$1" != "$2" ]]; then + echo "ERROR: md5sum of downloaded file ($2) did not match md5sum expected ($1)"; + exit 1 + fi + } + + md5sum_of_downloaded=$(md5sum --binary "~{download_subdir_local}/${downloaded_file_name}" | cut -f1 -d' ' | tee MD5_SUM_OF_DOWNLOADED_FILE) + + if ~{if defined(md5_hash_expected) then 
'true' else 'false'}; then + md5_hash_expected="~{md5_hash_expected}" + check_md5_sum $md5_hash_expected $md5sum_of_downloaded + fi + if ~{if defined(md5_hash_expected_file_url) then 'true' else 'false'}; then + md5_hash_expected="$(curl --silent ~{md5_hash_expected_file_url} | cut -f1 -d' ')" + check_md5_sum $md5_hash_expected $md5sum_of_downloaded + fi + + # report the file size, in bytes + printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES + >>> + runtime { + docker: "quay.io/broadinstitute/viral-baseimage:0.2.0" + memory: "2 GB" + cpu: 1 + disks: "local-disk " + disk_size + " LOCAL" + disk: disk_size + " GB" # TES + dx_instance_type: "mem1_ssd1_v2_x2" + maxRetries: 0 + preemptible: 1 + } + output { + File downloaded_response_file = glob("downloaded/*")[0] + File? downloaded_response_headers = basename(downloaded_response_file) + ".headers" + + Int file_size_bytes = read_int("SIZE_OF_DOWNLOADED_FILE_BYTES") + String md5_sum_of_response_file = read_string("MD5_SUM_OF_DOWNLOADED_FILE") + + File stdout = stdout() + File stderr = stderr() + } +} + task fasta_to_ids { meta { description: "Return the headers only from a fasta file"