-
Notifications
You must be signed in to change notification settings - Fork 359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
BT-732 Checksum validation for blobs read by engine #6838
Changes from 10 commits
f89854a
8c30dc7
2533e19
7e981d2
e31be35
c4f4c34
f67aae0
a87c15d
e570439
7c6cd13
bdbd0fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ import cromwell.core.path.Path | |
import cromwell.engine.io.IoActor._ | ||
import cromwell.engine.io.RetryableRequestSupport.{isInfinitelyRetryable, isRetryable} | ||
import cromwell.engine.io.{IoAttempts, IoCommandContext, IoCommandStalenessBackpressuring} | ||
import cromwell.filesystems.blob.BlobPath | ||
import cromwell.filesystems.drs.DrsPath | ||
import cromwell.filesystems.gcs.GcsPath | ||
import cromwell.filesystems.s3.S3Path | ||
|
@@ -128,21 +129,33 @@ class NioFlow(parallelism: Int, | |
|
||
def readFileAndChecksum: IO[String] = { | ||
for { | ||
fileHash <- getHash(command.file) | ||
fileHash <- getStoredHash(command.file) | ||
uncheckedValue <- readFile | ||
checksumResult <- checkHash(uncheckedValue, fileHash) | ||
checksumResult <- fileHash match { | ||
case Some(hash) => checkHash(uncheckedValue, hash) | ||
// If there is no stored checksum, don't attempt to validate. | ||
// If the missing checksum is itself an error condition, that | ||
// should be detected by the code that gets the FileHash. | ||
case None => IO.pure(ChecksumSkipped()) | ||
} | ||
verifiedValue <- checksumResult match { | ||
case _: ChecksumSkipped => IO.pure(uncheckedValue) | ||
case _: ChecksumSuccess => IO.pure(uncheckedValue) | ||
case failure: ChecksumFailure => IO.raiseError( | ||
ChecksumFailedException( | ||
s"Failed checksum for '${command.file}'. Expected '${fileHash.hashType}' hash of '${fileHash.hash}'. Calculated hash '${failure.calculatedHash}'")) | ||
fileHash match { | ||
case Some(hash) => s"Failed checksum for '${command.file}'. Expected '${hash.hashType}' hash of '${hash.hash}'. Calculated hash '${failure.calculatedHash}'" | ||
case None => s"Failed checksum for '${command.file}'. Couldn't find stored file hash." // This should never happen | ||
} | ||
) | ||
) | ||
} | ||
} yield verifiedValue | ||
} | ||
|
||
val fileContentIo = command.file match { | ||
case _: DrsPath => readFileAndChecksum | ||
case _: DrsPath => readFileAndChecksum | ||
case _: BlobPath => readFileAndChecksum | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could have done:
...but this seemed more readable to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree! |
||
case _ => readFile | ||
} | ||
fileContentIo.map(_.replaceAll("\\r\\n", "\\\n")) | ||
|
@@ -153,19 +166,27 @@ class NioFlow(parallelism: Int, | |
} | ||
|
||
private def hash(hash: IoHashCommand): IO[String] = { | ||
getHash(hash.file).map(_.hash) | ||
// If there is no hash accessible from the file storage system, | ||
// we'll read the file and generate the hash ourselves. | ||
getStoredHash(hash.file).flatMap { | ||
case Some(storedHash) => IO.pure(storedHash) | ||
case None => generateMd5FileHashForPath(hash.file) | ||
}.map(_.hash) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This "read file to generate hash" functionality used to be part of |
||
} | ||
|
||
private def getHash(file: Path): IO[FileHash] = { | ||
private def getStoredHash(file: Path): IO[Option[FileHash]] = { | ||
file match { | ||
case gcsPath: GcsPath => getFileHashForGcsPath(gcsPath) | ||
case gcsPath: GcsPath => getFileHashForGcsPath(gcsPath).map(Option(_)) | ||
case blobPath: BlobPath => getFileHashForBlobPath(blobPath) | ||
case drsPath: DrsPath => IO { | ||
// We assume all DRS files have a stored hash; this will throw | ||
// if the file does not. | ||
drsPath.getFileHash | ||
} | ||
}.map(Option(_)) | ||
case s3Path: S3Path => IO { | ||
FileHash(HashType.S3Etag, s3Path.eTag) | ||
Option(FileHash(HashType.S3Etag, s3Path.eTag)) | ||
} | ||
case path => getMd5FileHashForPath(path) | ||
case _ => IO.pure(None) | ||
} | ||
} | ||
|
||
|
@@ -201,7 +222,11 @@ class NioFlow(parallelism: Int, | |
gcsPath.objectBlobId.map(id => FileHash(HashType.GcsCrc32c, gcsPath.cloudStorage.get(id).getCrc32c)) | ||
} | ||
|
||
private def getMd5FileHashForPath(path: Path): IO[FileHash] = delayedIoFromTry { | ||
private def getFileHashForBlobPath(blobPath: BlobPath): IO[Option[FileHash]] = delayedIoFromTry { | ||
blobPath.md5HexString.map(md5 => md5.map(FileHash(HashType.Md5, _))) | ||
} | ||
|
||
private def generateMd5FileHashForPath(path: Path): IO[FileHash] = delayedIoFromTry { | ||
tryWithResource(() => path.newInputStream) { inputStream => | ||
FileHash(HashType.Md5, org.apache.commons.codec.digest.DigestUtils.md5Hex(inputStream)) | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,13 @@ | ||
package cromwell.filesystems.blob | ||
|
||
import com.azure.core.credential.AzureSasCredential | ||
import com.azure.storage.blob.nio.AzureFileSystem | ||
import com.azure.storage.blob.nio.{AzureBlobFileAttributes, AzureFileSystem} | ||
import com.google.common.net.UrlEscapers | ||
import cromwell.core.path.{NioPath, Path, PathBuilder} | ||
import cromwell.filesystems.blob.BlobPathBuilder._ | ||
|
||
import java.net.{MalformedURLException, URI} | ||
import java.nio.file.{FileSystem, FileSystemNotFoundException, FileSystems} | ||
import java.nio.file.{FileSystem, FileSystemNotFoundException, FileSystems, Files} | ||
import scala.jdk.CollectionConverters._ | ||
import scala.language.postfixOps | ||
import scala.util.{Failure, Try} | ||
|
@@ -90,4 +90,19 @@ case class BlobPath private[blob](nioPath: NioPath, endpoint: String, container: | |
override def pathAsString: String = List(endpoint, container, nioPath.toString()).mkString("/") | ||
|
||
override def pathWithoutScheme: String = parseURI(endpoint).getHost + "/" + container + "/" + nioPath.toString() | ||
|
||
def blobFileAttributes: Try[AzureBlobFileAttributes] = | ||
Try(Files.readAttributes(nioPath, classOf[AzureBlobFileAttributes])) | ||
|
||
def md5HexString: Try[Option[String]] = { | ||
blobFileAttributes.map(h => | ||
Option(h.blobHttpHeaders().getContentMd5) match { | ||
case None => None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this because `getContentMd5` can return `null`? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yup, exactly. Or, I'm not confident that it's impossible for it to return `null`. |
||
case Some(arr) if arr.isEmpty => None | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is really interesting, I hadn't realized you could pattern match like this! |
||
// Convert the bytes to a hex-encoded string. Note that this value | ||
// is rendered in base64 in the Azure web portal. | ||
case Some(bytes) => Option(bytes.map("%02x".format(_)).mkString) | ||
} | ||
) | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could also push this `match` into `checkHash`, but then the logic for "what do we do when there is or isn't a stored hash" is spread across `checkHash` and this function, which still has the logic for constructing the exception message... one case of which should never happen. 🤷♂️
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I actually had it that way originally and then changed it. It made kind of a mess in `checkHash`. It felt better to let that method just... check the hash.
I do not like having that should-never-happen case in the exception creation, sigh... the alternative was filling the string generation of the exception message with `${hash.map(_.hashType).getOrElse("<MISSING>")}`, which made it really long and hard to read. I could also create these strings as vals before generating the string and then plug them in, so all the `map`ing and `getOrElse`ing doesn't obscure the error message... that didn't occur to me before. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Crap, I just lost track of this thread between one moment and the next and merged this PR. Happy to submit another tiny one if you have a strong preference for handling the error differently.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nah, I don't think it's worth the time. I approved the PR without condition of changes, so merging it as-is was perfectly fine.