feat: further improvements

nahid18 · Sep 26, 2023 · e4091e2 · e4091e2
1 parent 5aaae8d
commit e4091e2
Show file tree

Hide file tree

Showing 8 changed files with 113 additions and 60 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: easylift
 Title: An R package to perform genomic liftover
-Version: 0.99.8
+Version: 0.99.9
 Date: 2023-09-25
 Authors@R: 
     c(

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,4 @@
-# easylift 0.99.7
+# easylift 0.99.9
 
 * Bug fixes and further improvements to the code.
 

diff --git a/R/easylift.R b/R/easylift.R
@@ -8,7 +8,7 @@
 #' @param chain The path to the chain file containing the liftover mapping.
 #' Can be provided in gzipped or non-gzipped format. If omitted, the function
 #' will look in the default BiocFileCache for a properly named chain file.
-#' @param bfc A BiocFileCache object, if not provided (most typically)
+#' @param bfc A BiocFileCache object (optional). If not provided (most typically),
 #' the default location will be used.
 #'
 #' @return A GRanges object with lifted genomic coordinates.
@@ -21,21 +21,31 @@
 #'   seqname = Rle(c("chr1", "chr2"), c(100000, 100000)),
 #'   ranges = IRanges(start = 1, end = 200000)
 #' )
+#' # Here, "hg19" is the source genome
 #' genome(gr) <- "hg19"
 #'
 #' # Here, we use the `system.file()` function because the chain file is in the
 #' # package (however if you need to point to any other file on your machine,
 #' # just do 'chain <- "path/to/your/hg19ToHg38.over.chain.gz"'):
 #' chain <- system.file("extdata", "hg19ToHg38.over.chain.gz", package = "easylift")
+#'
+#' # Here, "hg38" is the target genome
 #' easylift(gr, "hg38", chain)
 #'
 #' \dontrun{
-#' # To use `BiocFileCache` for the chain file, you can add it to the cache as follows:
+#' # To use `BiocFileCache` for the chain file, add it to the cache as follows:
 #' chain_file <- "/path/to/your/hg19ToHg38.over.chain.gz"
 #' bfc <- BiocFileCache()
-#' bfcadd(bfc, chain_file)
-#' # Then, you can use it in `easylift` like this:
+#'
+#' # Add chain file to cache if not already available
+#' if (nrow(bfcquery(bfc, basename(chain_file))) == 0)
+#'    bfcadd(bfc, chain_file)
+#'
+#' # Then, use it in `easylift` like this:
+#'
 #' easylift(gr, "hg38")
+#' # or
+#' gr |> easylift("hg38")
 #' }
 #' @import GenomicRanges
 #' @import GenomeInfoDb
@@ -71,14 +81,7 @@ easylift <- function(x, to, chain, bfc) {
     if (missing(bfc)) {
       bfc <- BiocFileCache()
     }
-    capTo <- paste0(toupper(substr(to, 1, 1)), substr(to, 2, nchar(to)))
-    trychainfile <- paste0(unique_genomes, "To", capTo, ".over.chain")
-    q <- bfcquery(bfc, trychainfile)
-    if (nrow(q) >= 1) {
-      chain <- bfc[[q$rid[1]]]
-    } else {
-      stop(trychainfile, " file not found!")
-    }
+    chain <- .get_chain_from_BiocFileCache(unique_genomes, to, bfc)
   }
 
   # Check if the chain file is gzipped and unzip if needed
@@ -113,3 +116,19 @@ easylift <- function(x, to, chain, bfc) {
 
   return(cur)
 }
+
+### Find the chain file for a 'from' -> 'to' liftover in a BiocFileCache.
+###
+### 'from' and 'to' are single strings containing UCSC genome names, and 'bfc'
+### is a BiocFileCache object. The cache is queried for the UCSC-style file
+### name "<from>To<To>.over.chain", where the first letter of 'to' is
+### upper-cased (e.g. "hg19" + "hg38" -> "hg19ToHg38.over.chain").
+### Returns the cache entry of the first matching resource (used by the caller
+### as the chain file), or signals an error when the query has no match.
+.get_chain_from_BiocFileCache <- function(from, to, bfc) {
+  capTo <- paste0(toupper(substr(to, 1, 1)), substr(to, 2, nchar(to)))
+  trychainfile <- paste0(from, "To", capTo, ".over.chain")
+  q <- bfcquery(bfc, trychainfile)
+  if (nrow(q) == 0) {
+    # 'bfc' may be a cache supplied by the caller, so do not claim that the
+    # default location was searched. call. = FALSE keeps this internal helper
+    # out of the user-facing error message.
+    stop(
+      "Chain file not specified and filename with ",
+      trychainfile,
+      " pattern not found in BiocFileCache.",
+      call. = FALSE
+    )
+  }
+  # Several cached resources may match the pattern; the first hit is used.
+  bfc[[q$rid[1]]]
+}
diff --git a/README.Rmd b/README.Rmd
@@ -47,31 +47,39 @@ gr <- GRanges(
     end = 200000
   )
 )
+# Here, "hg19" is the source genome
 genome(gr) <- "hg19"
-to <- "hg38"
 chain <- "hg19ToHg38.over.chain.gz"
 
-easylift(gr, to, chain)
+# Here, "hg38" is the target genome
+easylift(gr, "hg38", chain)
 ```
 
-To use `BiocFileCache` for the chain file, you can add it to the cache as follows:
+### BiocFileCache
+
+To use `BiocFileCache` for the chain file, add it to the cache as follows:
 
 ```{r example2, eval = FALSE}
 chain_file <- "/path/to/your/hg19ToHg38.over.chain.gz"
 bfc <- BiocFileCache()
-bfcadd(bfc, chain_file)
+
+# Add chain file to cache if not already available
+if (nrow(bfcquery(bfc, basename(chain_file))) == 0)
+    bfcadd(bfc, chain_file)
 ```
-Then, you can use it in `easylift` like this:
+Then, use it in `easylift` like this:
 ```{r example3, eval = FALSE}
-easylift(gr, "hg38")
+easylift(gr, "hg38") 
+# or
+gr |> easylift("hg38") 
 ```
 
 ## Citation
 
 To cite package `easylift` in publications use:
 
   Al Nahid A, Love M (2023). _easylift: An R package to perform
-  genomic liftover_. R package version 0.99.7,
+  genomic liftover_. R package version 0.99.9,
   <https://github.com/nahid18/easylift>.
 
 
@@ -82,7 +90,7 @@ A BibTeX entry for LaTeX users is
     title = {easylift: An R package to perform genomic liftover},
     author = {Abdullah Al Nahid and Michael Love},
     year = {2023},
-    note = {R package version 0.99.7},
+    note = {R package version 0.99.9},
     url = {https://github.com/nahid18/easylift},
   }
 ```

diff --git a/README.md b/README.md
@@ -42,34 +42,42 @@ gr <- GRanges(
     end = 200000
   )
 )
+# Here, "hg19" is the source genome
 genome(gr) <- "hg19"
-to <- "hg38"
 chain <- "hg19ToHg38.over.chain.gz"
 
-easylift(gr, to, chain)
+# Here, "hg38" is the target genome
+easylift(gr, "hg38", chain)
 ```
 
-To use `BiocFileCache` for the chain file, you can add it to the cache
-as follows:
+### BiocFileCache
+
+To use `BiocFileCache` for the chain file, add it to the cache as
+follows:
 
 ``` r
 chain_file <- "/path/to/your/hg19ToHg38.over.chain.gz"
 bfc <- BiocFileCache()
-bfcadd(bfc, chain_file)
+
+# Add chain file to cache if not already available
+if (nrow(bfcquery(bfc, basename(chain_file))) == 0)
+    bfcadd(bfc, chain_file)
 ```
 
-Then, you can use it in `easylift` like this:
+Then, use it in `easylift` like this:
 
 ``` r
-easylift(gr, "hg38")
+easylift(gr, "hg38") 
+# or
+gr |> easylift("hg38") 
 ```
 
 ## Citation
 
 To cite package `easylift` in publications use:
 
 Al Nahid A, Love M (2023). *easylift: An R package to perform genomic
-liftover*. R package version 0.99.7,
+liftover*. R package version 0.99.9,
 <https://github.com/nahid18/easylift>.
 
 A BibTeX entry for LaTeX users is
@@ -78,7 +86,7 @@ A BibTeX entry for LaTeX users is
         title = {easylift: An R package to perform genomic liftover},
         author = {Abdullah Al Nahid and Michael Love},
         year = {2023},
-        note = {R package version 0.99.7},
+        note = {R package version 0.99.9},
         url = {https://github.com/nahid18/easylift},
       }
 

diff --git a/man/easylift.Rd b/man/easylift.Rd
diff --git a/tests/testthat/test-easylift.R b/tests/testthat/test-easylift.R
@@ -2,51 +2,52 @@ library(GenomicRanges)
 library(IRanges)
 
 test_that("easylift function tests with valid chain files", {
-
   # Test 1: Test with a valid chain gzipped file
-  chain_path_gz <- system.file("extdata", "hg19ToHg38.over.chain.gz", package = "easylift")
+  chain_path_gz <-
+    system.file("extdata", "hg19ToHg38.over.chain.gz", package = "easylift")
   gr <- GenomicRanges::GRanges(
     seqnames = "chr1",
     ranges = IRanges::IRanges(start = 100, end = 200),
     strand = "+"
   )
   genome(gr) <- "hg19"
-  expect_type(easylift(gr, to = "hg38", chain = chain_path_gz), "S4")
-  expect_no_error(easylift(gr, to = "hg38", chain = chain_path_gz))
+  expect_type(easylift(x = gr, to = "hg38", chain = chain_path_gz), "S4")
+  expect_no_error(easylift(x = gr, to = "hg38", chain = chain_path_gz))
 
   # Test 2 and 3: Test with a valid chain file
-  chain_path <- system.file("extdata", "hg19ToHg38.over.chain", package = "easylift")
+  chain_path <-
+    system.file("extdata", "hg19ToHg38.over.chain", package = "easylift")
   gr2 <- GenomicRanges::GRanges(
     seqnames = "chr2",
     ranges = IRanges::IRanges(start = 200, end = 300),
     strand = "+"
   )
   genome(gr2) <- "hg19"
-  expect_type(easylift(gr2, to = "hg38", chain = chain_path), "S4")
-  expect_no_error(easylift(gr2, to = "hg38", chain = chain_path))
+  expect_type(easylift(x = gr2, to = "hg38", chain = chain_path), "S4")
+  expect_no_error(easylift(x = gr2, to = "hg38", chain = chain_path))
 })
 
 test_that("easylift function tests with error cases", {
-
+  # Define the chain path inside this block: objects created in another
+  # test_that() block are not visible here, so without this the expectations
+  # below could be satisfied by an "object 'chain_path' not found" error
+  # instead of the input validation they are meant to exercise.
+  chain_path <-
+    system.file("extdata", "hg19ToHg38.over.chain", package = "easylift")
+
   # Test 4: Test with an empty GRanges object
   gr3 <- GenomicRanges::GRanges()
-  expect_error(easylift(gr3, to = "hg38", chain = chain_path))
+  expect_error(easylift(x = gr3, to = "hg38", chain = chain_path))
 
   # Test 5: Test with missing genome information
   gr4 <- GenomicRanges::GRanges(
     seqnames = "chr4",
     ranges = IRanges::IRanges(start = 400, end = 500),
     strand = "+"
   )
-  expect_error(easylift(gr4, to = "hg38", chain = chain_path))
+  expect_error(easylift(x = gr4, to = "hg38", chain = chain_path))
 })
 
 test_that("easylift succeeds with BiocFileCache", {
   # Load package
   library("easylift")
 
   # Create a test chain file in the temporary directory
-  chain_file <- system.file("extdata", "hg19ToHg38.over.chain.gz", package = "easylift")
+  chain_file <-
+    system.file("extdata", "hg19ToHg38.over.chain.gz", package = "easylift")
 
   # Test 6: Check if the chain file exists
   expect_true(file.exists(chain_file), "Chain file should exist.")
@@ -65,21 +66,20 @@ test_that("easylift succeeds with BiocFileCache", {
   expect_true(nrow(q) > 0, "Chain file should exist in cache.")
 
   # Create a test GRanges object
-  gr <- GenomicRanges::GRanges(
-    seqname = Rle(c("chr1", "chr2"), c(100000, 100000)),
-    ranges = IRanges::IRanges(start = 1, end = 200000)
-  )
+  gr <- GenomicRanges::GRanges(seqname = Rle(c("chr1", "chr2"), c(100000, 100000)),
+                               ranges = IRanges::IRanges(start = 1, end = 200000))
 
   genome(gr) <- "hg19"
 
-  # Perform easylift with the target assembly
+  # Perform liftover with the target genome
   tryCatch({
-    result <- easylift(x=gr, to="hg38", bfc=bfc)
+    result <- easylift(x = gr, to = "hg38", bfc = bfc)
   }, error = function(e) {
     cat("Error message:", conditionMessage(e), "\n")
     stop("easylift encountered an error.")
   })
 
   # Test 8: Check if easylift succeeded without error
-  expect_true(!is(result, "try-error"), "easylift should succeed without error.")
+  expect_true(!is(result, "try-error"),
+              "easylift should succeed without error.")
 })
diff --git a/vignettes/easylift.Rmd b/vignettes/easylift.Rmd
@@ -56,8 +56,9 @@ gr <- GRanges(
     start = 1, end = 200000
   )
 )
+# Here, "hg19" is the source genome
 genome(gr) <- "hg19"
-to <- "hg38"
+
 # Here, we use the `system.file()` function because the chain file is in the
 # package (however if you need to point to any other file on your machine,
 # just do 'chain <- "path/to/your/hg19ToHg38.over.chain.gz"'):
@@ -68,24 +69,31 @@ gr
 
 ## Run
 
+Provide the `GRanges` object, the target genome (e.g. `hg38`), and the chain file.
+
 ```{r run}
-easylift(gr, to, chain)
+# Here, "hg38" is the target genome
+easylift(gr, "hg38", chain)
 ```
 
-## BiocFileCache
+## Run with BiocFileCache
 
-To use `BiocFileCache` for the chain file, you can add it to the cache
-as follows:
+To use `BiocFileCache` for the chain file, add it to the cache as follows:
 
 ```{r bioCache, eval=FALSE}
 chain_file <- "/path/to/your/hg19ToHg38.over.chain.gz"
 bfc <- BiocFileCache()
-bfcadd(bfc, chain_file)
+
+# Add chain file to cache if not already available
+if (nrow(bfcquery(bfc, basename(chain_file))) == 0)
+    bfcadd(bfc, chain_file)
 ```
 
-Then, you can use it in `easylift` like this:
+Then, use it in `easylift` like this:
 
 ```{r bioCache2, eval=FALSE}
+easylift(gr, "hg38") 
+# or
 gr |> easylift("hg38")
 ```