Skip to content

Commit

Permalink
Validate Barcodes
Browse files Browse the repository at this point in the history
* Drop sanitize barcodes in favor of validating them with regexs.

* Update documentation to reflect the sanitization to validation change.

* Other assorted README updates.

---------

Co-authored-by: Dylan Webster <[email protected]>
  • Loading branch information
gmjoseph and dylanwebster committed Oct 1, 2024
1 parent e06ecc6 commit 75343f1
Show file tree
Hide file tree
Showing 10 changed files with 233 additions and 135 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Encoding: UTF-8
Depends:
R (>= 4.0.0)
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Suggests:
testthat (>= 3.0.0),
Matrix,
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(select_assay)
export(select_clusters)
export(select_projections)
export(setup)
export(validate_barcodes)
export(validate_clusters)
export(validate_count_mat)
export(validate_projections)
Expand All @@ -24,4 +25,3 @@ importFrom(utils,download.file)
importFrom(utils,packageVersion)
importFrom(utils,read.csv)
importFrom(utils,sessionInfo)
importFrom(utils,strcapture)
8 changes: 3 additions & 5 deletions R/hdf5.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,17 +46,15 @@ create_hdf5 <- function(
#' @noRd
write_mat <- function(f, count_mat, feature_ids) {
features <- rownames(count_mat)
barcodes_unmodified <- colnames(count_mat)
barcodes_formatted <- sanitize_barcodes(barcodes_unmodified)
barcodes <- colnames(count_mat)
feature_count <- length(features)
barcode_count <- length(barcodes_formatted )
barcode_count <- length(barcodes)

# create groups
matrix_group <- f$create_group("matrix")
features_group <- matrix_group$create_group("features")

create_str_dataset(matrix_group, "barcodes", barcodes_formatted )
create_str_dataset(matrix_group, "barcodes_unmodified", barcodes_unmodified)
create_str_dataset(matrix_group, "barcodes", barcodes)
create_dataset(matrix_group, "data", as.integer(count_mat@x))
create_dataset(matrix_group, "indices", as.integer(count_mat@i))
create_dataset(matrix_group, "indptr", as.integer(count_mat@p))
Expand Down
59 changes: 0 additions & 59 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -250,65 +250,6 @@ cluster_levels_word_like <- function(cluster) {
})
}

#' Rewrite barcodes into the expected "BARCODE[-NUM][-PREFIX][-SUFFIX]" layout
#'
#' Barcodes that already pass `are_barcodes_valid()` are returned untouched.
#' Otherwise each barcode is split into prefix / barcode / suffix pieces and
#' reassembled with the base-pair run first. The rewrite is all-or-nothing:
#' if any single barcode fails to match the pattern, the input is returned
#' unchanged.
#'
#' @param barcodes character vector of barcode names
#'
#' @importFrom utils strcapture
#'
#' @return character vector of barcode names, sanitized when possible
#'
#' @noRd
sanitize_barcodes <- function(barcodes) {
  # Nothing to do when the input is already in the expected format.
  if (are_barcodes_valid(barcodes)) {
    return(barcodes)
  }

  # Shapes seen in practice:
  #   * Seurat Integrate prepends a prefix:  "12U_ACTGACTGACTG-1"
  #   * users prepend their own prefix:      "SOMEPREFIX:ACTGACTGACTG"
  pattern <- "^(.*?)(_|-|:)?([ACTG]{6,})(-\\d+)?(_|-|:)?(.*?)$"

  # Only sanitize when every barcode matches the pattern.
  matching <- grep(pattern, barcodes)
  if (length(matching) != length(barcodes)) {
    return(barcodes)
  }

  # One column per capture group of `pattern`, in order.
  capture_proto <- list(
    prefix = character(),
    sep1 = character(),
    barcode = character(),
    barcodeDashNum = character(),
    sep2 = character(),
    suffix = character()
  )

  # NOTE: perl regexes are required here to support non-greedy matching.
  groups <- strcapture(
    pattern = pattern,
    x = barcodes,
    perl = TRUE,
    proto = capture_proto
  )

  # "-<part>" when the captured part is non-empty, "" otherwise.
  dash_prefixed <- function(part) {
    if (nchar(part) > 0) {
      return(sprintf("-%s", part))
    }
    ""
  }

  # Reassemble each barcode as "BARCODE[-NUM][-PREFIX][-SUFFIX]".
  updated_barcodes <- character(length(barcodes))
  for (i in seq_len(nrow(groups))) {
    parts <- groups[i, ]
    lead <- dash_prefixed(parts$prefix)
    trail <- dash_prefixed(parts$suffix)
    updated_barcodes[[i]] <- sprintf(
      "%s%s%s%s",
      parts$barcode, parts$barcodeDashNum, lead, trail
    )
  }

  updated_barcodes
}

#' Gets the systems OS.
#'
#' @return "windows", "mac", "unix"
Expand Down
42 changes: 31 additions & 11 deletions R/validate.R
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,12 @@ validate_count_mat <- function(count_mat, feature_ids = NULL) {
return(err("features cannot be the empty string"))
}

barcodes <- sanitize_barcodes(barcodes)

if (!are_barcodes_valid(barcodes)) {
resp <- validate_barcodes(barcodes)
if (!resp$success) {
barcode_msg <- paste(
'There is an issue with the formatting of your barcodes.',
'Barcodes should begin with base pairs and end with an optional hyphen and suffix.',
'For further information, please see the documentation: 10xgen.com/louper'
'There is an issue with the formatting of your barcodes:',
resp$msg,
'Please see the readme at github.com/10xGenomics/loupeR'
)

return(err(barcode_msg))
Expand Down Expand Up @@ -81,12 +80,33 @@ validate_count_mat <- function(count_mat, feature_ids = NULL) {
#'
#' @param barcodes a character vector
#'
#' @return A boolean true or false
#' @return A list with two elements:
#' \itemize{
#' \item success: a logical value indicating success (TRUE) or failure (FALSE)
#' \item msg: an optional error message (NULL if success is TRUE)
#' }
#'
#' @importFrom methods is
#'
#' @noRd
are_barcodes_valid <- function(barcodes) {
pattern <-"^([ACTG]{6,})(-.*?)?$"
return(all(grepl(pattern, barcodes)))
#' @export
validate_barcodes <- function(barcodes) {
  # Accepted barcode shapes. Every pattern allows an optional prefix that ends
  # in ":" or "_" and an optional suffix that starts with ":" or "_".
  #   * barcode:       a run of 14 or more A/C/G/T characters
  #   * barcode_gem:   the same run followed by "-<digits>"
  #   * visium_hd:     "s_" + 3 digits + "um_" + 5 digits + "_" + 5 digits
  #   * visium_hd_gem: the visium_hd shape followed by "-<digits>"
  #   * xenium_cell:   1 to 8 letters in the range a-p followed by "-<digits>"
  patterns <- c(
    barcode = "^(.*[:_])?([ACGT]{14,})([:_].*)?$",
    barcode_gem = "^(.*[:_])?([ACGT]{14,})-(\\d+)([:_].*)?$",
    visium_hd = "^(.*[:_])?(s_\\d{3}um_\\d{5}_\\d{5})([:_].*)?$",
    visium_hd_gem = "^(.*[:_])?(s_\\d{3}um_\\d{5}_\\d{5})-(\\d+)([:_].*)?$",
    xenium_cell = "^(.*[:_])?([a-p]{1,8})-(\\d+)([:_].*)?$"
  )

  # One vectorized grepl() pass per pattern instead of up to five scalar calls
  # per barcode; a barcode is valid when it matches at least one pattern.
  # grepl() yields FALSE (never NA) for missing values, so NA barcodes are
  # reported as invalid.
  per_pattern <- lapply(patterns, grepl, x = barcodes)
  is_valid <- Reduce(`|`, per_pattern)

  if (!all(is_valid)) {
    # Report the first offending barcode, in input order.
    first_bad <- which(!is_valid)[1L]
    return(err(paste("Invalid barcode:", barcodes[[first_bad]])))
  }

  SUCCESS
}

#' Validate the seurat clusters
Expand Down
49 changes: 41 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
<a href="#troubleshooting">Troubleshooting</a>
</p>

`loupeR` creates a 10x Genomics Loupe file from a Seurat object. 10x Genomics Loupe Browser can visualize single-cell and spatial data from 10x Genomics. *Only single-cell gene expression datasets are supported*.
`loupeR` creates 10x Genomics Loupe files from Seurat objects and other 10x Genomics data in R. 10x Genomics Loupe Browser can visualize single-cell and spatial data from 10x Genomics. _Only single-cell gene expression datasets are supported in LoupeR_.

## How to Use

Expand Down Expand Up @@ -54,7 +54,7 @@ create_loupe(
)
```

Additionally, use the utility function `read_feature_ids_from_tsv` to read the Ensembl IDs from the 10x dataset. A Seurat object will only have imported the feature names or ids and attached these as rownames to the count matrix. In order for the Ensembl ID links to work correctly within Loupe Browser, one must manually import them and include them.
Additionally, use the utility function `read_feature_ids_from_tsv` to read the Ensembl IDs from the 10x dataset. A Seurat object will only have imported the feature names or ids and attached these as rownames to the count matrix. In order for the Ensembl ID links to work correctly within Loupe Browser, one must manually import them and include them.

```R
# import the library
Expand All @@ -73,7 +73,7 @@ create_loupe_from_seurat(seurat_obj, feature_ids = feature_ids)

### HDF5

Before using `loupeR`, make sure that your system has installed [HDF5](https://www.hdfgroup.org/downloads/hdf5). The HDF5 organization requires registration before being able to download the installer. Below are some other more convenient methods for installing HDF5 if you happen to have these package managers installed.
Before using `loupeR`, make sure that your system has installed [HDF5](https://www.hdfgroup.org/downloads/hdf5). The HDF5 organization requires registration before being able to download the installer. Below are some other more convenient methods for installing HDF5 if you happen to have these package managers installed.

- macOS with [Homebrew](https://brew.sh/) - `brew install hdf5` <br>
- windows with [vcpkg](https://vcpkg.io/en/index.html) - `.\vcpkg install hdf5`
Expand All @@ -95,9 +95,9 @@ install.packages(url, repos = NULL, type = "source")

### Installing loupeR using the `remotes` package

Another installation option is to use the `remotes` package to directly install `loupeR` and its dependencies. The installed package won't include the prebundled louper executable, so you must invoke the `loupeR::setup()` function which will go and download it.
Another installation option is to use the `remotes` package to directly install `loupeR` and its dependencies. The installed package won't include the prebundled louper executable, so you must invoke the `loupeR::setup()` function which will go and download it.

``` r
```r
remotes::install_github("10XGenomics/loupeR")
loupeR::setup()
```
Expand All @@ -111,14 +111,47 @@ If you are interested in automating LoupeR installation and execution (and are b
With new versions of the Loupe Browser, new versions of LoupeR need to be released. The table below shows version requirements between the two.

| LoupeR Version | Loupe Browser Version |
| ------------- | ------------- |
| -------------- | --------------------- |
| v1.0.x | Loupe Browser >= 7.0 |
| v1.1.x | Loupe Browser >= 8.0 |
| v1.1.1 | Loupe Browser >= 8.0 |
| v1.1.2 | Loupe Browser >= 8.1 |

## Tutorials

* [Demo notebook](https://colab.research.google.com/github/10XGenomics/loupeR/blob/main/misc/tutorials/5k_mouse_brain.ipynb) with basic processing of an example 10x dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/10XGenomics/loupeR/blob/main/misc/tutorials/5k_mouse_brain.ipynb)
- [Demo notebook](https://colab.research.google.com/github/10XGenomics/loupeR/blob/main/misc/tutorials/5k_mouse_brain.ipynb) with basic processing of an example 10x dataset [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/10XGenomics/loupeR/blob/main/misc/tutorials/5k_mouse_brain.ipynb)

## Barcode Formatting

Barcodes must be from 10x Genomics experiments to work with LoupeR. Valid 10x Genomics Single Cell Gene Expression barcodes consist of 16 characters drawn from A, C, G, and T, followed by an optional GEM well suffix, for example:

```
AAACCCAAGAAATTGC
AAACCCAAGAAATTGC-1
```

Barcodes can also have an additional optional prefix or suffix. These optional prefixes and suffixes must be delineated either by a `:` or a `_`:

```
prefix_AAACCCAAGAAATTGC
AAACCCAAGAAATTGC_suffix
prefix_AAACCCAAGAAATTGC_suffix
prefix:AAACATACAAACAG
AAACATACAAACAG:suffix
prefix:AAACATACAAACAG:suffix
prefix_AAACCCAAGAAATTGC-1
AAACCCAAGAAATTGC-1_suffix
prefix_AAACCCAAGAAATTGC-1_suffix
prefix:AAACCCAAGAAATTGC-1
AAACCCAAGAAATTGC-1:suffix
prefix:AAACCCAAGAAATTGC-1:suffix
```

**Note**: Visium and Xenium barcodes are formatted differently. Visium and Xenium data are currently enabled for use with LoupeR, but **_not_** fully supported. Expression data for these assays can be processed by loupeR, but **_not_** image data.

See `test-validate.R` for further examples of both valid and invalid barcode formatting, as well as `validate.R` for the exact formatting requirements as code.

## Troubleshooting

Expand Down
17 changes: 17 additions & 0 deletions man/validate_barcodes.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions tests/testthat/helper.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#' Create random barcode
random_barcode <- function(size = 10) {
random_barcode <- function(size = 14) {
paste0(sample(c("A", "C", "T", "G"), size, replace=TRUE), collapse="")
}

Expand All @@ -17,7 +17,7 @@ create_count_mat <- function(rows, cols, valid_barcodes = FALSE) {
colnames <- as.character()
if (cols > 0) {
if (valid_barcodes) {
colnames <- lapply(rep(10, cols), random_barcode)
colnames <- lapply(rep(14, cols), random_barcode)
} else {
colnames <- paste0("col", 1:cols)
}
Expand Down
38 changes: 0 additions & 38 deletions tests/testthat/test-util.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,41 +95,3 @@ test_that("deduplicate_clusters prefers named factors", {
expect_length(clusters, 1)
expect_equal(clusters[[1]], cell_types)
})

test_that("sanitize_barcodes corrects barcodes", {
  # Each case maps an input barcode (name) to its expected sanitized form
  # (value); every pair is checked with expect_equal().
  expect_sanitized <- function(cases) {
    for (input in names(cases)) {
      expect_equal(sanitize_barcodes(input), cases[[input]])
    }
  }

  # already valid: returned unchanged, with or without a lane number
  expect_sanitized(c(
    "ACTGAA" = "ACTGAA",
    "ACTGAA-1" = "ACTGAA-1"
  ))

  # plain prefix, joined by "_", "-" or ":"
  expect_sanitized(c(
    "prefix_ACTGAA" = "ACTGAA-prefix",
    "prefix-ACTGAA" = "ACTGAA-prefix",
    "prefix:ACTGAA" = "ACTGAA-prefix"
  ))

  # plain prefix + lane number
  expect_sanitized(c(
    "prefix_ACTGAA-1" = "ACTGAA-1-prefix",
    "prefix-ACTGAA-1" = "ACTGAA-1-prefix",
    "prefix:ACTGAA-1" = "ACTGAA-1-prefix"
  ))

  # prefix that itself contains an underscore
  expect_sanitized(c(
    "pre_fix_ACTGAA" = "ACTGAA-pre_fix",
    "pre_fix-ACTGAA" = "ACTGAA-pre_fix",
    "pre_fix:ACTGAA" = "ACTGAA-pre_fix"
  ))

  # underscore prefix + lane number
  expect_sanitized(c(
    "pre_fix_ACTGAA-1" = "ACTGAA-1-pre_fix",
    "pre_fix-ACTGAA-1" = "ACTGAA-1-pre_fix",
    "pre_fix:ACTGAA-1" = "ACTGAA-1-pre_fix"
  ))

  # underscore prefix + underscore suffix
  expect_sanitized(c(
    "pre_fix_ACTGAA-suf_fix" = "ACTGAA-pre_fix-suf_fix",
    "pre_fix-ACTGAA-suf_fix" = "ACTGAA-pre_fix-suf_fix",
    "pre_fix:ACTGAA-suf_fix" = "ACTGAA-pre_fix-suf_fix"
  ))

  # underscore prefix + lane number + underscore suffix
  expect_sanitized(c(
    "pre_fix_ACTGAA-1-suf_fix" = "ACTGAA-1-pre_fix-suf_fix",
    "pre_fix-ACTGAA-1-suf_fix" = "ACTGAA-1-pre_fix-suf_fix",
    "pre_fix:ACTGAA-1-suf_fix" = "ACTGAA-1-pre_fix-suf_fix"
  ))
})
Loading

0 comments on commit 75343f1

Please sign in to comment.