Skip to content

Commit

Permalink
Merge branch 'master' into ct-viral-classify-2.2.5
Browse files Browse the repository at this point in the history
  • Loading branch information
tomkinsc authored Nov 6, 2024
2 parents 37f9941 + e498352 commit db9e84d
Showing 1 changed file with 173 additions and 0 deletions.
173 changes: 173 additions & 0 deletions pipes/WDL/tasks/tasks_utils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,179 @@ task tar_extract {
}
}
task download_from_url {
    meta {
        description: "Download a file from a URL. This task exists as a workaround until Terra supports this functionality natively (cromwell already does: https://cromwell.readthedocs.io/en/stable/filesystems/HTTP/). http[s] and ftp supported"
        volatile: true
    }
    input {
        String  url_to_download
        String? output_filename
        String? additional_wget_opts
        String  request_method = "GET"
        Int     request_max_retries = 1
        String? md5_hash_expected
        String? md5_hash_expected_file_url
        Boolean save_response_header_to_file = false
        Int     disk_size = 50
    }
    parameter_meta {
        url_to_download: {
            description: "The URL to download; this is passed to wget"
        }
        output_filename: {
            description: "The filename to use for the downloaded file. This is optional, though it can be helpful in the event the server does not advise on a filename via the 'Content-Disposition' header."
        }
        additional_wget_opts: {
            description: "Additional options passed to wget as part of the download command."
        }
        request_method: {
            description: "The request method ('GET', 'POST', etc.) passed to wget. Optional (default: 'GET')"
        }
        request_max_retries: {
            description: "The maximum number of (additional) re-tries to attempt in the event of failed download."
        }
        md5_hash_expected: {
            description: "The (binary-mode) md5 hash expected for the file to download. If provided and the value does not match the md5 hash of the downloaded file, the task will fail. mutually exclusive with md5_hash_expected_file_url"
        }
        md5_hash_expected_file_url: {
            description: "The url of a file containing the (binary-mode) md5 hash expected for the file to download. If provided and the value does not match the md5 hash of the downloaded file, the task will fail. mutually exclusive with md5_hash_expected"
        }
        save_response_header_to_file: {
            description: "If save_response_header_to_file=true, http response headers will be saved to an output file. Only applicable for http[s] URLs."
        }
    }
    String download_subdir_local = "downloaded"
    command <<<
        # abort on any command failure; without this (and with the former
        # '|| (...; exit 1)' subshell, whose 'exit' could not terminate the
        # script), a failed download allowed the task to continue on a
        # missing or partial file
        set -e

        # enforce that only one source of expected md5 hash can be provided
        ~{if defined(md5_hash_expected) && defined(md5_hash_expected_file_url) then 'echo "The inputs \'md5_hash_expected\' and \'md5_hash_expected_file_url\' cannot both be specified; please provide only one."; exit 1;' else ''}

        mkdir -p "~{download_subdir_local}/tmp"
        pushd "~{download_subdir_local}"

        # ---- download desired file into tmp/ so the (possibly server-named)
        #      file can be identified afterward as the sole entry in tmp/
        pushd "tmp"

        # get the desired file using wget
        #   --content-disposition = use the file name suggested by the server via the Content-Disposition header
        #   --trust-server-names  = ...and in the event of a redirect, use the value of the final page rather than that of the original url
        #   --save-headers        = save the headers sent by the HTTP server to the file, preceding the actual contents, with an empty line as the separator.
        # NOTE: the failure handler is a { } group (not a subshell) so its
        #       'exit 1' terminates the task
        wget \
          --read-timeout 3 --waitretry 30 \
          --no-verbose \
          --method ~{request_method} \
          ~{if defined(output_filename) then "--output-document '~{output_filename}'" else ""} \
          --tries ~{request_max_retries} \
          --content-disposition --trust-server-names ~{additional_wget_opts} \
          '~{url_to_download}' \
          ~{if save_response_header_to_file then "--save-headers" else ""} || { echo "ERROR: request to ~{request_method} file from URL failed: ~{url_to_download}"; exit 1; }

        # get the name of the downloaded file (quoted throughout: the
        # server-suggested name may contain spaces or glob characters)
        downloaded_file_name="$(basename "$(ls -1 | head -n1)")"
        if [ ! -f "$downloaded_file_name" ]; then
            echo "Could not locate downloaded file \"$downloaded_file_name\""
            exit 1
        fi
        if [ ! -s "$downloaded_file_name" ]; then
            echo "Downloaded file appears empty: \"$downloaded_file_name\""
            exit 1
        fi
        popd # return to downloaded/

        # (only for http(s)) split http response headers from response body
        # since wget --save-headers stores both in a single file separated by
        # a blank (\r-terminated) line; headers land one directory up, next to
        # the 'downloaded/' subdir, so the body remains the only glob match
        if [[ "~{url_to_download}" =~ ^https?:// ]] && ~{if save_response_header_to_file then "true" else "false"}; then
            echo "Saving response headers separately..."
            csplit -f response -s "tmp/${downloaded_file_name}" $'/^\r$/+1' && \
              mv response00 "../${downloaded_file_name}.headers" && \
              mv response01 "${downloaded_file_name}" && \
              rm "tmp/${downloaded_file_name}"
        else
            mv "tmp/${downloaded_file_name}" "${downloaded_file_name}"
        fi
        rm -r "tmp"
        popd # return to job working directory

        check_md5_sum() {
            # $1 = md5sum expected
            # $2 = md5sum of downloaded file
            if [[ "$1" != "$2" ]]; then
                echo "ERROR: md5sum of downloaded file ($2) did not match md5sum expected ($1)"
                exit 1
            fi
        }

        # record the md5 for the task output while also keeping it for comparison
        md5sum_of_downloaded=$(md5sum --binary "~{download_subdir_local}/${downloaded_file_name}" | cut -f1 -d' ' | tee MD5_SUM_OF_DOWNLOADED_FILE)
        if ~{if defined(md5_hash_expected) then 'true' else 'false'}; then
            check_md5_sum "~{md5_hash_expected}" "$md5sum_of_downloaded"
        fi
        if ~{if defined(md5_hash_expected_file_url) then 'true' else 'false'}; then
            # hash file is expected in 'md5sum' format: hash first, whitespace, filename
            md5_hash_expected="$(curl --silent '~{md5_hash_expected_file_url}' | cut -f1 -d' ')"
            check_md5_sum "$md5_hash_expected" "$md5sum_of_downloaded"
        fi

        # report the file size, in bytes
        printf "Downloaded file size (bytes): " && stat --format=%s "~{download_subdir_local}/${downloaded_file_name}" | tee SIZE_OF_DOWNLOADED_FILE_BYTES
    >>>
    runtime {
        docker: "quay.io/broadinstitute/viral-baseimage:0.2.0"
        memory: "2 GB"
        cpu: 1
        disks:  "local-disk " + disk_size + " LOCAL"
        disk: disk_size + " GB" # TES
        dx_instance_type: "mem1_ssd1_v2_x2"
        maxRetries: 0
        preemptible: 1
    }
    output {
        File   downloaded_response_file     = glob("downloaded/*")[0]
        File?  downloaded_response_headers  = basename(downloaded_response_file) + ".headers"
        Int    file_size_bytes              = read_int("SIZE_OF_DOWNLOADED_FILE_BYTES")
        String md5_sum_of_response_file     = read_string("MD5_SUM_OF_DOWNLOADED_FILE")
        File   stdout                       = stdout()
        File   stderr                       = stderr()
    }
}
task fasta_to_ids {
meta {
description: "Return the headers only from a fasta file"
Expand Down

0 comments on commit db9e84d

Please sign in to comment.