Skip to content

Commit

Permalink
Merge pull request #177 from dramanica/master
Browse files Browse the repository at this point in the history
Catch variance equal to zero when scaling
Close #52
  • Loading branch information
privefl authored Mar 24, 2024
2 parents f307d29 + f80ed10 commit 70729e9
Show file tree
Hide file tree
Showing 18 changed files with 137 additions and 36 deletions.
19 changes: 11 additions & 8 deletions .github/workflows/check-standard.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Workflow derived from https://github.com/r-lib/actions/tree/master/examples
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
Expand All @@ -18,7 +18,7 @@ jobs:
fail-fast: false
matrix:
config:
- {os: macOS-latest, r: 'release'}
- {os: macos-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
- {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
Expand All @@ -32,21 +32,24 @@ jobs:
_R_CHECK_LENGTH_1_LOGIC2_: verbose

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-pandoc@v1
- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v1
- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: rcmdcheck
extra-packages: any::rcmdcheck
needs: check

- uses: r-lib/actions/check-r-package@v1
- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true

- name: Show testthat output
if: always()
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ Encoding: UTF-8
Package: bigstatsr
Type: Package
Title: Statistical Tools for Filebacked Big Matrices
Version: 1.5.13
Date: 2023-12-13
Version: 1.5.15
Date: 2024-03-24
Authors@R: c(
person("Florian", "Privé", email = "[email protected]",
role = c("aut", "cre")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ exportMethods(tcrossprod)
exportMethods(typeof)
import(foreach)
import(ggplot2)
import(rmio)
importFrom(Rcpp,sourceCpp)
importFrom(bigassertr,assert_01)
importFrom(bigassertr,assert_all)
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## bigstatsr 1.5.14

- Error when variables with a zero scaling are used in e.g. `big_randomSVD()` and `big_crossprodSelf()` (#52).

## bigstatsr 1.5.13

- Add parameter `backingfile` to `big_crossprodSelf()` and `big_cor()` (#170).
Expand Down
1 change: 1 addition & 0 deletions R/FBM.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ sub_bk <- function(path, replacement = "", stop_if_not_ext = TRUE) {
#' X[] # access as standard R matrix
#'
#' @exportClass FBM
#' @import rmio
#'
FBM_RC <- methods::setRefClass(

Expand Down
2 changes: 2 additions & 0 deletions R/crossprodSelf.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ big_crossprodSelf <- function(
tmp1 <- X[ind.row, ind.col[ind1]]

ms <- fun.scaling(X, ind.row = ind.row, ind.col = ind.col[ind1])
if (any_near0(ms$scale)) stop2(MSG_ZERO_SCALE)

mu[ind1] <- ms$center
delta[ind1] <- ms$scale
sums[ind1] <- colSums(tmp1)
Expand Down
35 changes: 20 additions & 15 deletions R/plot.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
#'
#' @examples
#' library(ggplot2)
#' qplot(y = 1:10)
#' qplot(y = 1:10) + theme_bw()
#' qplot(y = 1:10) + theme_bigstatsr()
#' (p <- ggplot(mapping = aes(x = 1:10, y = 1:10)) + geom_point())
#' p + theme_bw()
#' p + theme_bigstatsr()
theme_bigstatsr <- function(size.rel = 1) {
theme_bw() +
theme(
Expand All @@ -31,10 +31,6 @@ theme_bigstatsr <- function(size.rel = 1) {
)
}

MY_THEME <- function(p, coeff = 1) {
p + theme_bigstatsr(size.rel = coeff)
}

#' @importFrom cowplot plot_grid
#' @export
cowplot::plot_grid
Expand Down Expand Up @@ -101,7 +97,9 @@ plot.big_SVD <- function(x, type = c("screeplot", "scores", "loadings"),

assert_lengths(nval, 1)

p <- MY_THEME(qplot(y = x$d[seq_len(nval)]), coeff = coeff) +
p <- ggplot(mapping = aes(x = seq_len(nval), y = x$d[seq_len(nval)])) +
theme_bigstatsr(size.rel = coeff) +
geom_point() +
geom_line() +
scale_y_log10() +
labs(title = "Scree Plot", x = "PC Index", y = "Singular Value")
Expand Down Expand Up @@ -133,7 +131,9 @@ plot.big_SVD <- function(x, type = c("screeplot", "scores", "loadings"),
nx <- scores[1]
ny <- scores[2]

MY_THEME(qplot(x = sc[, nx], y = sc[, ny]), coeff = coeff) +
ggplot(mapping = aes(x = sc[, nx], y = sc[, ny])) +
geom_point() +
theme_bigstatsr(size.rel = coeff) +
coord_fixed() +
labs(title = "Scores of PCA", x = paste0("PC", nx), y = paste0("PC", ny))

Expand All @@ -155,7 +155,9 @@ plot.big_SVD <- function(x, type = c("screeplot", "scores", "loadings"),

} else {

p <- MY_THEME(qplot(y = x$v[, loadings]), coeff = coeff) +
p <- ggplot(mapping = aes(x = rows_along(x$v), y = x$v[, loadings])) +
geom_point() +
theme_bigstatsr(size.rel = coeff) +
labs(title = paste0("Loadings of PC", loadings),
x = "Column index", y = NULL)

Expand Down Expand Up @@ -215,17 +217,20 @@ plot.mhtest <- function(x, type = c("hist", "Manhattan", "Q-Q", "Volcano"),
type <- match.arg(type)
main <- paste(type, "plot")

if (type == "Manhattan") {
qplot(y = -lpval) +
p <- if (type == "Manhattan") {
ggplot(mapping = aes(x = seq_along(lpval), y = -lpval)) +
geom_point() +
labs(title = main, x = "Column Index",
y = expression(-log[10](italic("p-value"))))
} else if (type == "Volcano") {
qplot(x = x[["estim"]], y = -lpval) +
ggplot(mapping = aes(x = x[["estim"]], y = -lpval)) +
geom_point() +
labs(title = main, x = "Estimate",
y = expression(-log[10](italic("p-value"))))
} else if (type == "Q-Q") {
unif.ranked <- stats::ppoints(length(lpval))[rank(lpval)]
qplot(x = -log10(unif.ranked), y = -lpval) +
ggplot(mapping = aes(x = -log10(unif.ranked), y = -lpval)) +
geom_point() +
labs(title = main,
x = expression(Expected~~-log[10](italic("p-value"))),
y = expression(Observed~~-log[10](italic("p-value")))) +
Expand All @@ -239,7 +244,7 @@ plot.mhtest <- function(x, type = c("hist", "Manhattan", "Q-Q", "Volcano"),
labs(x = "p-value")
}

last_plot() + theme_bigstatsr(size.rel = coeff)
p + theme_bigstatsr(size.rel = coeff)
}

################################################################################
Expand Down
27 changes: 20 additions & 7 deletions R/randomSVD.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ svds4.par <- function(X, fun.scaling, ind.row, ind.col, k,

Ax <- FBM(n, ncores)
Atx <- FBM(m, 1)
calc <- FBM(ncores, 1, init = 0)
calc <- FBM(ncores, 1, init = -1)

cluster_type <- getOption("bigstatsr.cluster.type")
if (verbose) {
Expand All @@ -26,24 +26,32 @@ svds4.par <- function(X, fun.scaling, ind.row, ind.col, k,

if (ic == 0) { # I'm the master

# Wait for the slaves to finish computing the scalings
while (any(calc[] < 0)) {
if (any(calc[] == -2)) {
calc[] <- 3 # signal all slaves to stop
stop2(MSG_ZERO_SCALE)
} else Sys.sleep(TIME)
}

printf <- function(...) cat(sprintf(...))
it <- 0
# A
A <- function(x, args) {
printf("%d - computing A * x\n", it <<- it + 1)
Atx[] <- x
calc[] <- 1 # make them work
# Master wait for its slaves to finish working
while (sum(calc[]) > 0) Sys.sleep(TIME)
# the master wait for its slaves to finish working
while (any(calc[] > 0)) Sys.sleep(TIME)
rowSums(Ax[])
}
# Atrans
Atrans <- function(x, args) {
printf("%d - computing At * x\n", it <<- it + 1)
Ax[, 1] <- x
calc[] <- 2 # make them work
# Master wait for its slaves to finish working
while (sum(calc[]) > 0) Sys.sleep(TIME)
# the master wait for its slaves to finish working
while (any(calc[] > 0)) Sys.sleep(TIME)
Atx[]
}

Expand All @@ -61,6 +69,12 @@ svds4.par <- function(X, fun.scaling, ind.row, ind.col, k,

# Scaling
ms <- fun.scaling(X, ind.row = ind.row, ind.col = ind.col.part)
if (any_near0(ms$scale)) {
calc[ic] <- -2 # signal the master there is an issue with the scaling
return(ms)
} else {
calc[ic] <- 0
}

repeat {
# Slaves wait for their master to give them orders
Expand All @@ -78,8 +92,6 @@ svds4.par <- function(X, fun.scaling, ind.row, ind.col, k,
} else if (c == 3) {
# End
break
} else {
stop("RandomSVD: unclear order from the master.")
}
calc[ic] <- 0
}
Expand Down Expand Up @@ -110,6 +122,7 @@ svds4.seq <- function(X, fun.scaling, ind.row, ind.col, k, tol, verbose,

# scaling
ms <- fun.scaling(X, ind.row, ind.col)
if (any_near0(ms$scale)) stop2(MSG_ZERO_SCALE)

printf <- function(...) if (verbose) cat(sprintf(...))
it <- 0
Expand Down
2 changes: 2 additions & 0 deletions R/scaling.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ big_scale <- function(center = TRUE, scale = TRUE) {
sds <- rep(1, m)
}

if (any_near0(sds)) warning2(MSG_ZERO_SCALE)

data.frame(center = means, scale = sds)
}
}
Expand Down
2 changes: 2 additions & 0 deletions R/tcrossprodSelf.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ big_tcrossprodSelf <- function(
ind <- seq2(intervals[j, ])
ind.col.ind <- ind.col[ind]
ms <- fun.scaling(X, ind.row = ind.row, ind.col = ind.col.ind)
if (any_near0(ms$scale)) stop2(MSG_ZERO_SCALE)

means[ind] <- ms$center
sds[ind] <- ms$scale
increment_scaled_tcrossprod(K, X_part_temp, X,
Expand Down
11 changes: 11 additions & 0 deletions R/utils-assert.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@

################################################################################

MSG_ZERO_SCALE <-
"Some variables have zero scaling; remove them before attempting to scale."

################################################################################

any_near0 <- function(x) {
any(dplyr::near(x, 0))
}

################################################################################

as_vec <- function(x) {
x2 <- drop(x)
if (is.matrix(x2))
Expand Down
6 changes: 5 additions & 1 deletion docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions inst/WORDLIST
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@ bty
cbind
cdot
CMSA
Codecov
confounders
Crossprod
crossprods
didn
dir
doesn
doi
eigen
Expand Down
6 changes: 3 additions & 3 deletions man/theme_bigstatsr.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions tests/testthat/helper-init.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ diffPCs <- function(test, rot) {

################################################################################

custom_scaling <- function(X, ind.row = rows_along(X),
ind.col = cols_along(X), ncores = 1) {
stats <- big_colstats(X, ind.row, ind.col, ncores = ncores)
data.frame(center = stats$sum / length(ind.row), scale = sqrt(stats$var))
}

################################################################################

set.seed(NULL)
if (not_cran) {
# Seeds that won't work (because of bad luck)
Expand Down
12 changes: 12 additions & 0 deletions tests/testthat/test-crossprodSelf.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,15 @@ test_that("equality with crossprod with half of the data", {
})

################################################################################

test_that("we catch zero variance variables when scaling", {

X <- FBM(20, 20, init = rnorm(400))
expect_no_error(big_crossprodSelf(X, fun.scaling = custom_scaling))

X[, 1] <- 0
expect_error(big_crossprodSelf(X, fun.scaling = custom_scaling),
"Some variables have zero scaling")
})

################################################################################
19 changes: 19 additions & 0 deletions tests/testthat/test-randomSVD.R
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,22 @@ test_that("as_scaling_fun() works", {
})

################################################################################

test_that("zero variance is caught", {

X <- FBM(20, 20, init = rnorm(400))
expect_no_error(big_randomSVD(X, big_scale()))

X[, 1] <- 0 # set a variable to have zero variance
# this is also catched as a warning in big_scale()
expect_error(expect_warning(big_randomSVD(X, big_scale()),
"Some variables have zero scaling"),
"Some variables have zero scaling")

expect_error(big_randomSVD(X, custom_scaling),
"Some variables have zero scaling")
expect_error(big_randomSVD(X, custom_scaling, ncores = test_cores()),
"Some variables have zero scaling")
})

################################################################################
Loading

0 comments on commit 70729e9

Please sign in to comment.