Skip to content

Commit

Permalink
Merge branch 'master' into ct-viral-classify-2.2.5
Browse files Browse the repository at this point in the history
  • Loading branch information
tomkinsc authored Nov 6, 2024
2 parents 37f9941 + e498352 commit db9e84d
Showing 1 changed file with 173 additions and 0 deletions.
173 changes: 173 additions & 0 deletions pipes/WDL/tasks/tasks_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,179 @@ task tar_extract {
}
}
task download_from_url {
    meta {
        description: "Download a file from a URL. This task exists as a workaround until Terra supports this functionality natively (cromwell already does: https://cromwell.readthedocs.io/en/stable/filesystems/HTTP/). http[s] and ftp supported"
        volatile: true
    }
    input {
        String  url_to_download
        String? output_filename
        String? additional_wget_opts
        String  request_method = "GET"
        Int     request_max_retries = 1
        String? md5_hash_expected
        String? md5_hash_expected_file_url
        Boolean save_response_header_to_file = false
        Int     disk_size = 50
    }
    parameter_meta {
        url_to_download: {
            description: "The URL to download; this is passed to wget"
        }
        output_filename: {
            description: "The filename to use for the downloaded file. This is optional, though it can be helpful in the event the server does not advise on a filename via the 'Content-Disposition' header."
        }
        additional_wget_opts: {
            description: "Additional options passed to wget as part of the download command."
        }
        request_method: {
            description: "The request method ('GET', 'POST', etc.) passed to wget. Optional (default: 'GET')"
        }
        request_max_retries: {
            description: "The maximum number of (additional) re-tries to attempt in the event of failed download."
        }
        md5_hash_expected: {
            description: "The (binary-mode) md5 hash expected for the file to download. If provided and the value does not match the md5 hash of the downloaded file, the task will fail. mutually exclusive with md5_hash_expected_file_url"
        }
        md5_hash_expected_file_url: {
            description: "The url of a file containing the (binary-mode) md5 hash expected for the file to download. If provided and the value does not match the md5 hash of the downloaded file, the task will fail. mutually exclusive with md5_hash_expected"
        }
        save_response_header_to_file: {
            description: "If save_response_header_to_file=true, http response headers will be saved to an output file. Only applicable for http[s] URLs."
        }
    }
    String download_subdir_local = "downloaded"
    command <<<
        # abort on any command failure; without this (and with the former
        # '|| (...; exit 1)' subshell, whose 'exit' could not terminate the
        # script), a failed download allowed the task to continue on a
        # missing or partial file
        set -e

        # enforce that only one source of expected md5 hash can be provided
        ~{if defined(md5_hash_expected) && defined(md5_hash_expected_file_url) then 'echo "The inputs \'md5_hash_expected\' and \'md5_hash_expected_file_url\' cannot both be specified; please provide only one."; exit 1;' else ''}

        mkdir -p "~{download_subdir_local}/tmp"
        pushd "~{download_subdir_local}"

        # ---- download desired file into tmp/ so the (possibly server-named)
        #      file can be identified afterward as the sole entry in tmp/
        pushd "tmp"

        # get the desired file using wget
        #   --content-disposition = use the file name suggested by the server via the Content-Disposition header
        #   --trust-server-names  = ...and in the event of a redirect, use the value of the final page rather than that of the original url
        #   --save-headers        = save the headers sent by the HTTP server to the file, preceding the actual contents, with an empty line as the separator.
        # NOTE: the failure handler is a { } group (not a subshell) so its
        #       'exit 1' terminates the task
        wget \
          --read-timeout 3 --waitretry 30 \
          --no-verbose \
          --method ~{request_method} \
          ~{if defined(output_filename) then "--output-document '~{output_filename}'" else ""} \
          --tries ~{request_max_retries} \
          --content-disposition --trust-server-names ~{additional_wget_opts} \
          '~{url_to_download}' \
          ~{if save_response_header_to_file then "--save-headers" else ""} || { echo "ERROR: request to ~{request_method} file from URL failed: ~{url_to_download}"; exit 1; }

        # get the name of the downloaded file (quoted throughout: the
        # server-suggested name may contain spaces or glob characters)
        downloaded_file_name="$(basename "$(ls -1 | head -n1)")"
        if [ ! -f "$downloaded_file_name" ]; then
            echo "Could not locate downloaded file \"$downloaded_file_name\""
            exit 1
        fi
        if [ ! -s "$downloaded_file_name" ]; then
            echo "Downloaded file appears empty: \"$downloaded_file_name\""
            exit 1
        fi
        popd # return to downloaded/

        # (only for http(s)) split http response headers from response body
        # since wget --save-headers stores both in a single file separated by
        # a blank (\r-terminated) line; headers land one directory up, next to
        # the 'downloaded/' subdir, so the body remains the only glob match
        if [[ "~{url_to_download}" =~ ^https?:// ]] && ~{if save_response_header_to_file then "true" else "false"}; then
            echo "Saving response headers separately..."
            csplit -f response -s "tmp/${downloaded_file_name}" $'/^\r$/+1' && \
              mv response00 "../${downloaded_file_name}.headers" && \
              mv response01 "${downloaded_file_name}" && \
              rm "tmp/${downloaded_file_name}"
        else
            mv "tmp/${downloaded_file_name}" "${downloaded_file_name}"
        fi
        rm -r "tmp"
        popd # return to job working directory

        check_md5_sum() {
            # $1 = md5sum expected
            # $2 = md5sum of downloaded file
            if [[ "$1" != "$2" ]]; then
                echo "ERROR: md5sum of downloaded file ($2) did not match md5sum expected ($1)"
                exit 1
            fi
        }

        # record the md5 for the task output while also keeping it for comparison
        md5sum_of_downloaded=$(md5sum --binary "~{download_subdir_local}/${downloaded_file_name}" | cut -f1 -d' ' | tee MD5_SUM_OF_DOWNLOADED_FILE)
        if ~{if defined(md5_hash_expected) then 'true' else 'false'}; then
            check_md5_sum "~{md5_hash_expected}" "$md5sum_of_downloaded"
        fi
        if ~{if defined(md5_hash_expected_file_url) then 'true' else 'false'}; then
            # hash file is expected in 'md5sum' format: hash first, whitespace, filename
            md5_hash_expected="$(curl --silent '~{md5_hash_expected_file_url}' | cut -f1 -d' ')"
            check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded"
        fi

        # report the file size, in bytes
        printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES
    >>>
    runtime {
        docker: "quay.io/broadinstitute/viral-baseimage:0.2.0"
        memory: "2 GB"
        cpu: 1
        disks:  "local-disk " + disk_size + " LOCAL"
        disk: disk_size + " GB" # TES
        dx_instance_type: "mem1_ssd1_v2_x2"
        maxRetries: 0
        preemptible: 1
    }
    output {
        File   downloaded_response_file     = glob("downloaded/*")[0]
        File?  downloaded_response_headers  = basename(downloaded_response_file) + ".headers"
        Int    file_size_bytes              = read_int("SIZE_OF_DOWNLOADED_FILE_BYTES")
        String md5_sum_of_response_file     = read_string("MD5_SUM_OF_DOWNLOADED_FILE")
        File   stdout                       = stdout()
        File   stderr                       = stderr()
    }
}
task fasta_to_ids {
meta {
description: "Return the headers only from a fasta file"
Expand Down

0 comments on commit db9e84d

Please sign in to comment.