-
Notifications
You must be signed in to change notification settings - Fork 359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[WX-499] DRS Parallel Downloads Follow-up #7229
Changes from all commits
447c13f
09cbc14
275480a
b5a5975
6f5c7d6
c33054c
6d2173d
2b114af
5a9f5a9
7bd8b25
74b718b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,17 @@ | ||
package drs.localizer.downloaders | ||
|
||
import cats.effect.{ExitCode, IO} | ||
import cloud.nio.impl.drs.{AccessUrl, DrsResolverResponse} | ||
import cloud.nio.impl.drs.{AccessUrl} | ||
import com.typesafe.scalalogging.StrictLogging | ||
|
||
import java.nio.charset.StandardCharsets | ||
import java.nio.file.{Files, Path, Paths} | ||
import scala.sys.process.{Process, ProcessLogger} | ||
import scala.util.matching.Regex | ||
import drs.localizer.ResolvedDrsUrl | ||
import spray.json.DefaultJsonProtocol.{StringJsonFormat, listFormat, mapFormat} | ||
import spray.json._ | ||
|
||
case class GetmResult(returnCode: Int, stderr: String) | ||
/** | ||
* Getm is a python tool that is used to download resolved DRS uris quickly and in parallel. | ||
|
@@ -40,39 +43,25 @@ case class BulkAccessUrlDownloader(resolvedUrls : List[ResolvedDrsUrl]) extends | |
* @return Filepath of a getm-manifest.json that Getm can use to download multiple files in parallel. | ||
*/ | ||
def generateJsonManifest(resolvedUrls : List[ResolvedDrsUrl]): IO[Path] = { | ||
def toJsonString(drsResponse: DrsResolverResponse, destinationFilepath: String): String = { | ||
//NB: trailing comma is being removed in generateJsonManifest | ||
val accessUrl: AccessUrl = drsResponse.accessUrl.getOrElse(AccessUrl("missing", None)) | ||
drsResponse.hashes.map(_ => { | ||
val checksum = GetmChecksum(drsResponse.hashes, accessUrl).value.getOrElse("error_calculating_checksum") | ||
val checksumAlgorithm = GetmChecksum(drsResponse.hashes, accessUrl).getmAlgorithm | ||
s""" { | ||
| "url" : "${accessUrl.url}", | ||
| "filepath" : "$destinationFilepath", | ||
| "checksum" : "$checksum", | ||
| "checksum-algorithm" : "$checksumAlgorithm" | ||
| }, | ||
|""".stripMargin | ||
}).getOrElse( | ||
s""" { | ||
| "url" : "${accessUrl.url}", | ||
| "filepath" : "$destinationFilepath" | ||
| }, | ||
|""".stripMargin | ||
) | ||
} | ||
IO { | ||
var jsonString: String = "[\n" | ||
for (resolvedUrl <- resolvedUrls) { | ||
jsonString += toJsonString(resolvedUrl.drsResponse, resolvedUrl.downloadDestinationPath) | ||
} | ||
if(jsonString.contains(',')) { | ||
//remove trailing comma from array elements, but don't crash on empty list. | ||
jsonString = jsonString.substring(0, jsonString.lastIndexOf(",")) | ||
} | ||
jsonString += "\n]" | ||
Files.write(getmManifestPath, jsonString.getBytes(StandardCharsets.UTF_8)) | ||
def resolvedUrlToJsonMap(resolvedUrl: ResolvedDrsUrl): Map[String,String] = { | ||
val accessUrl: AccessUrl = resolvedUrl.drsResponse.accessUrl.getOrElse(AccessUrl("missing", None)) | ||
resolvedUrl.drsResponse.hashes.map{_ => | ||
val checksum = GetmChecksum(resolvedUrl.drsResponse.hashes, accessUrl).value.getOrElse("error_calculating_checksum") | ||
val checksumAlgorithm = GetmChecksum(resolvedUrl.drsResponse.hashes, accessUrl).getmAlgorithm | ||
Map( | ||
("url", accessUrl.url), | ||
("filepath", resolvedUrl.downloadDestinationPath), | ||
("checksum", checksum), | ||
("checksum-algorithm", checksumAlgorithm) | ||
) | ||
}.getOrElse(Map( | ||
("url", accessUrl.url), | ||
("filepath", resolvedUrl.downloadDestinationPath) | ||
)) | ||
} | ||
|
||
val jsonArray: String = resolvedUrls.map(resolved => resolvedUrlToJsonMap(resolved)).toJson.prettyPrint | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Not a blocker for this PR, just TOL - I wonder how many files we could handle before we wouldn't want to handle this as a single string. There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Interesting question! Would that bring us back to manual json-string-munging so we could stream bytes into a file?? ;) Back of the napkin math: a long DRS URL might be 1,000 characters. Java strings are 40 + length bytes. Call it 1kb per DRS url. 100MB of RAM gets us ~100,000 DRS URLs. Seems like we would need some pretty extreme inputs to cause a problem. There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. I think it would probably bring us to a streaming write using Spray - surely there's a way to stream bytes into an open file using |
||
IO(Files.write(getmManifestPath, jsonArray.getBytes(StandardCharsets.UTF_8))) | ||
} | ||
|
||
def deleteJsonManifest() = { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🎉 🎉 🎉