From e1617c571c5205ac38acb809a1246f7b6c60fbcb Mon Sep 17 00:00:00 2001 From: gothub Date: Wed, 18 Nov 2020 09:04:34 -0800 Subject: [PATCH] Don't recalculate checksum on package download (new default case) (#261) --- R/D1Client.R | 42 +++++++++++++++++++++------------- man/getDataObject.Rd | 7 +++--- man/getDataPackage.Rd | 6 ++--- tests/testthat/test.D1Client.R | 2 +- 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/R/D1Client.R b/R/D1Client.R index 56b69d6..1eac325 100644 --- a/R/D1Client.R +++ b/R/D1Client.R @@ -252,8 +252,9 @@ setMethod("getD1Object", "D1Client", function(x, identifier) { #' @param limit A \code{character} value specifying maximum package member size to download. Specified with "KB", "MB" or "TB" #' for example: "100KB", "10MB", "20GB", "1TB". The default is "1MB". Only takes effect if 'lazyLoad=FALSE'. #' @param quiet A \code{'logical'}. If TRUE (the default) then informational messages will not be printed. -#' @param checksumAlgorithm A \code{character} value specifying the algorithm to use to calculate the system metadata check -#' for the object's data bytes for example: "SHA-256" +#' @param checksumAlgorithm A \code{character} value specifying the algorithm to use to re-calculate (after download) the system metadata checksum +#' for the object's data bytes for example: "SHA-256". The default is "NA", which specifies that this +#' re-calculation will not be performed. #' @param ... (not yet used) #' @rdname getDataObject #' @aliases getDataObject @@ -274,7 +275,7 @@ setGeneric("getDataObject", function(x, identifier, ...) { #' @rdname getDataObject #' @export setMethod("getDataObject", "D1Client", function(x, identifier, lazyLoad=FALSE, limit="1MB", quiet=TRUE, - checksumAlgorithm="SHA-256") { + checksumAlgorithm=as.character(NA)) { # Resolve the object location # This service is too chatty if any of the locations aren't available @@ -373,20 +374,29 @@ setMethod("getDataObject", "D1Client", function(x, identifier, lazyLoad=FALSE, l # If the checksum for current object does not match the requested checksum algorithm, then calculate # it if the object's data is local, otherwise ask DataONE to calculate it. - if(tolower(sysmeta@checksumAlgorithm) != tolower(checksumAlgorithm)) { - # Bytes were not downloaded into the DataObject - if (deferredDownload) { - checksum = getChecksum(currentMN, pid=identifier, checksumAlgorithm=checksumAlgorithm) - sysmeta@checksum = checksum - sysmeta@checksumAlgorithm <- checksumAlgorithm - cat(sprintf("got new %s checksum from dataone for id %s: %s\n", checksumAlgorithm, identifier, checksum)) - } - } + if(!is.na(checksumAlgorithm)) { + if(tolower(sysmeta@checksumAlgorithm) != tolower(checksumAlgorithm)) { + # Bytes were not downloaded into the DataObject + if (deferredDownload) { + checksum = getChecksum(currentMN, pid=identifier, checksumAlgorithm=checksumAlgorithm) + sysmeta@checksum = checksum + sysmeta@checksumAlgorithm <- checksumAlgorithm + } + if(!quiet) { + cat(sprintf("Fetched recalculated checksum from DataONE using %s for id %s: %s\n", checksumAlgorithm, identifier, checksum)) + } + } + } else { + # The checksum algorithm was set to NA, meaning don't re-calculate the checksum after download. Since the + # DataObject constructor requires an algorithm to be specified, use the algorithm specified in the + # system metadata that was just downloaded for this object. + checksumAlgorithm <- sysmeta@checksumAlgorithm + } # Construct and return a DataObject # Notice that we are passing the existing sysmeta for this object via the 'id' parameter, # which will cause the DataObject to use this sysmeta and not generate a new one. However, if the - # sysmeta checksum algorithm is different thatn the pecified algorithm, then the checksum will + # sysmeta checksum algorithm is different than the specified algorithm, then the checksum will # be recalculated. do <- new("DataObject", id=sysmeta, dataobj=bytes, dataURL=dataURL, checksumAlgorithm=checksumAlgorithm) @@ -437,12 +447,12 @@ setGeneric("getDataPackage", function(x, identifier, ...) { #' @param limit A \code{character} value specifying maximum package member size to download. Specified with "KB", "MB" or "TB" #' for example: "100KB", "10MB", "20GB", "1TB". The default is "1MB". Only takes effect if 'lazyLoad=FALSE'. #' @param quiet A \code{'logical'}. If TRUE (the default) then informational messages will not be printed. -#' @param checksumAlgorithm A \code{character} value specifying the algorithm to use to calculate the system metadata check -#' for the object's data bytes for example: "SHA-256" +#' @param checksumAlgorithm A \code{character} value specifying the algorithm to use to re-calculate (after download) the system metadata checksum +#' for the object's data bytes for example: "SHA-256". The default is "NA", which specifies that this re-calculation will not be performed. #' @param ... (not yet used) #' @export setMethod("getDataPackage", "D1Client", function(x, identifier, lazyLoad=FALSE, limit="1MB", quiet=TRUE, - checksumAlgorithm="SHA-256") { + checksumAlgorithm=as.character(NA)) { # The identifier provided could be the package id (resource map), the metadata id or a package member (data, etc) # The solr queries attempt to determine which id was specified and may issue additional queries to get all the diff --git a/man/getDataObject.Rd b/man/getDataObject.Rd index 776f72d..a577b98 100644 --- a/man/getDataObject.Rd +++ b/man/getDataObject.Rd @@ -13,7 +13,7 @@ getDataObject(x, identifier, ...) lazyLoad = FALSE, limit = "1MB", quiet = TRUE, - checksumAlgorithm = "SHA-256" + checksumAlgorithm = as.character(NA) ) } \arguments{ @@ -30,8 +30,9 @@ for example: "100KB", "10MB", "20GB", "1TB". The default is "1MB". Only takes ef \item{quiet}{A \code{'logical'}. If TRUE (the default) then informational messages will not be printed.} -\item{checksumAlgorithm}{A \code{character} value specifying the algorithm to use to calculate the system metadata check -for the object's data bytes for example: "SHA-256"} +\item{checksumAlgorithm}{A \code{character} value specifying the algorithm to use to re-calculate (after download) the system metadata checksum +for the object's data bytes for example: "SHA-256". The default is "NA", which specifies that this +re-calculation will not be performed.} } \value{ A DataObject or NULL if the object was not found in DataONE diff --git a/man/getDataPackage.Rd b/man/getDataPackage.Rd index c0bcfc1..25b4b20 100644 --- a/man/getDataPackage.Rd +++ b/man/getDataPackage.Rd @@ -13,7 +13,7 @@ getDataPackage(x, identifier, ...) lazyLoad = FALSE, limit = "1MB", quiet = TRUE, - checksumAlgorithm = "SHA-256" + checksumAlgorithm = as.character(NA) ) } \arguments{ @@ -31,8 +31,8 @@ for example: "100KB", "10MB", "20GB", "1TB". The default is "1MB". Only takes ef \item{quiet}{A \code{'logical'}. If TRUE (the default) then informational messages will not be printed.} -\item{checksumAlgorithm}{A \code{character} value specifying the algorithm to use to calculate the system metadata check -for the object's data bytes for example: "SHA-256"} +\item{checksumAlgorithm}{A \code{character} value specifying the algorithm to use to re-calculate (after download) the system metadata checksum +for the object's data bytes for example: "SHA-256". The default is "NA", which specifies that this re-calculation will not be performed.} } \value{ A DataPackage or NULL if the package was not found in DataONE diff --git a/tests/testthat/test.D1Client.R b/tests/testthat/test.D1Client.R index 910ee3c..cc2b150 100644 --- a/tests/testthat/test.D1Client.R +++ b/tests/testthat/test.D1Client.R @@ -87,7 +87,7 @@ test_that("D1Client getDataObject", { # Try retrieving a known object from the PROD environment pid <- "solson.5.1" - obj <- getDataObject(d1cKNB, pid) + obj <- getDataObject(d1cKNB, pid, checksumAlgorithm="SHA-256") cname <- class(obj)[1] expect_match(cname, "DataObject") expect_match(class(obj@sysmeta), "SystemMetadata")