updated fst

thierrygosselin · Mar 15, 2023 · 2eb1c49 · 2eb1c49
1 parent 2d5cc95
commit 2eb1c49
Show file tree

Hide file tree

Showing 64 changed files with 2,195 additions and 1,568 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: assigner
 Type: Package
 Title: Assignment Analysis with RADseq Data
-Version: 0.5.8
-Date: 2020-10-27
+Version: 0.5.9
+Date: 2023-03-15
 Encoding: UTF-8
 Authors@R: c(
   person("Thierry", "Gosselin", email = "[email protected]", role = c("aut", "cre")),
@@ -19,9 +19,7 @@ Imports:
     dplyr (>= 1.0.0),
     furrr (>= 0.2.0),
     magrittr,
-    radiator,
-    tidyverse,
-    vroom
+    radiator
 Suggests:
     BiocManager,
     carrier,
@@ -42,13 +40,15 @@ Suggests:
     tibble,
     tidyr,
     tidyselect,
+    tidyverse,
+    vroom,
     utils
 Remotes: thierrygosselin/radiator
 License: GPL-3
 LazyLoad: yes
 LazyData: true
 VignetteBuilder: 
     knitr
-RoxygenNote: 7.1.1
+RoxygenNote: 7.2.3
 URL: http://thierrygosselin.github.io/assigner/
 BugReports: https://github.com/thierrygosselin/assigner/issues
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,8 @@
+# assigner 0.5.9 2023-03-15
+
+* work on *assigner::fst_WC84*
+
+
 # assigner 0.5.8 2020-10-20
 
 * Version bump because it updates numerous packages: `tidyr`, `readr`, using `future`, `carrier`.

diff --git a/R/fst_NEI87.R b/R/fst_NEI87.R
@@ -156,13 +156,13 @@
 #' \strong{Wide format:}
 #' The wide format cannot store metadata info.
 #' The wide format starts with these 2 id columns:
-#' \code{INDIVIDUALS}, \code{POP_ID} (that refers to any grouping of individuals),
+#' \code{INDIVIDUALS}, \code{STRATA} (that refers to any grouping of individuals),
 #' the remaining columns are the markers in separate columns storing genotypes.
 #'
 #' \strong{Long/Tidy format:}
 #' The long format is considered to be a tidy data frame and can store metadata info.
 #' (e.g. from a VCF see \pkg{radiator} \code{\link{tidy_genomic_data}}). A minimum of 4 columns
-#' are required in the long format: \code{INDIVIDUALS}, \code{POP_ID},
+#' are required in the long format: \code{INDIVIDUALS}, \code{STRATA},
 #' \code{MARKERS or LOCUS} and \code{GENOTYPE or GT}. The rest are considered metata info.
 #'
 #' \strong{2 genotypes formats are available:}
@@ -290,21 +290,16 @@ fst_NEI87 <- function(
       # message("strata file: yes")
       number.columns.strata <- max(utils::count.fields(strata, sep = "\t"))
       col.types <- stringi::stri_join(rep("c", number.columns.strata), collapse = "")
-      suppressMessages(strata.df <- readr::read_tsv(file = strata, col_names = TRUE, col_types = col.types) %>%
-                         dplyr::rename(POP_ID = STRATA))
+      suppressMessages(
+        strata.df <- readr::read_tsv(file = strata, col_names = TRUE, col_types = col.types)
+      )
     } else {
       # message("strata object: yes")
-      colnames(strata) <- stringi::stri_replace_all_fixed(
-        str = colnames(strata),
-        pattern = "STRATA",
-        replacement = "POP_ID",
-        vectorize_all = FALSE
-      )
       strata.df <- strata
     }
 
     # Remove potential whitespace in pop_id
-    strata.df$POP_ID <- stringi::stri_replace_all_fixed(strata.df$POP_ID, pattern = " ", replacement = "_", vectorize_all = FALSE)
+    strata.df$STRATA <- stringi::stri_replace_all_fixed(strata.df$STRATA, pattern = " ", replacement = "_", vectorize_all = FALSE)
 
     strata.df$INDIVIDUALS <- stringi::stri_replace_all_fixed(
       str = strata.df$INDIVIDUALS,
@@ -314,7 +309,7 @@ fst_NEI87 <- function(
     )
 
     input <- input %>%
-      dplyr::select(-POP_ID) %>%
+      dplyr::select(-STRATA) %>%
       dplyr::left_join(strata.df, by = "INDIVIDUALS")
   }
 
@@ -323,14 +318,14 @@ fst_NEI87 <- function(
 
   # subsampling data------------------------------------------------------------
   # create the subsampling list
-  ind.pop.df <- dplyr::distinct(.data = input, POP_ID, INDIVIDUALS)
+  ind.pop.df <- dplyr::distinct(.data = input, STRATA, INDIVIDUALS)
 
   if (is.null(subsample)) {
     iteration.subsample <- 1
   } else {
     if (subsample == "min") {
       subsample <- ind.pop.df %>%
-        dplyr::group_by(POP_ID) %>%
+        dplyr::group_by(STRATA) %>%
         dplyr::tally(.) %>%
         dplyr::filter(n == min(n)) %>%
         dplyr::ungroup(.) %>%
@@ -647,11 +642,11 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
       A2 = stringi::stri_sub(GT, 4,6)
     ) %>%
     dplyr::select(-GT) %>%
-    assigner::rad_long(x = ., cols = c("MARKERS", "INDIVIDUALS", "POP_ID"), names_to = "ALLELES", values_to = "GT")
+    assigner::rad_long(x = ., cols = c("MARKERS", "INDIVIDUALS", "STRATA"), names_to = "ALLELES", values_to = "GT")
 
   # frequency per markers, alleles, pop
   p <- x %>%
-    dplyr::group_by(MARKERS, POP_ID) %>%
+    dplyr::group_by(MARKERS, STRATA) %>%
     dplyr::count(GT) %>%
     dplyr::mutate(P = n / sum(n)) %>%
     dplyr::select(-n) %>%
@@ -660,7 +655,7 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
   # mp: mean frequency per markers
   # mp2: sum of square mean frequency per markers
   mean.p2 <- p %>%
-    tidyr::complete(data = ., POP_ID, tidyr::nesting(MARKERS, GT), fill = list(P = 0)) %>%
+    tidyr::complete(data = ., STRATA, tidyr::nesting(MARKERS, GT), fill = list(P = 0)) %>%
     dplyr::group_by(MARKERS, GT) %>%
     dplyr::summarise(MP = mean(P, na.rm = TRUE)) %>%
     dplyr::group_by(MARKERS) %>%
@@ -669,7 +664,7 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
 
   # msp2 mean frequency per markers per pop
   mean.frequency.markers <- p %>%
-    dplyr::group_by(MARKERS, POP_ID) %>%
+    dplyr::group_by(MARKERS, STRATA) %>%
     dplyr::summarise(SP2 = sum(P^2)) %>%
     dplyr::group_by(MARKERS) %>%
     dplyr::summarise(MSP2 = mean(SP2, na.rm = TRUE)) %>%
@@ -679,11 +674,11 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
   # Mean heterozygosity observed per pop and markers
   # mean heterozygosity across all markers
   mean.het.obs.markers <- x %>%
-    dplyr::distinct(MARKERS, POP_ID, INDIVIDUALS, GT) %>%
-    dplyr::group_by(MARKERS, POP_ID, INDIVIDUALS) %>%
+    dplyr::distinct(MARKERS, STRATA, INDIVIDUALS, GT) %>%
+    dplyr::group_by(MARKERS, STRATA, INDIVIDUALS) %>%
     dplyr::tally(.) %>%
     dplyr::mutate(HO = n - 1) %>%
-    dplyr::group_by(POP_ID, MARKERS) %>%
+    dplyr::group_by(STRATA, MARKERS) %>%
     dplyr::summarise(HO = mean(HO)) %>%
     dplyr::group_by(MARKERS) %>%
     dplyr::summarise(HO = mean(HO)) %>%
@@ -692,7 +687,7 @@ compute_fst_nei <- function(x, ci = FALSE, iteration.ci = 100, quantiles.ci = c(
   # mn: corrected mean number of individuals per markers
   #n: number of individuals, per pop and markers
   fst.data <- x %>%
-    dplyr::group_by(POP_ID, MARKERS) %>%
+    dplyr::group_by(STRATA, MARKERS) %>%
     dplyr::distinct(INDIVIDUALS) %>%
     dplyr::tally(.) %>%
     dplyr::mutate(N_INV = 1 / n) %>%
@@ -853,8 +848,8 @@ pairwise_fst_nei <- carrier::crate(function(
   # list.pair <- 2
   pop.select <- stringi::stri_join(purrr::flatten(pop.pairwise[list.pair]))
   data.select <- data.genotyped %>%
-    dplyr::filter(POP_ID %in% pop.select) %>%
-    dplyr::mutate(POP_ID = droplevels(x = POP_ID))
+    dplyr::filter(STRATA %in% pop.select) %>%
+    dplyr::mutate(STRATA = droplevels(x = STRATA))
   fst.select <- assigner::compute_fst_nei(x = data.select, ci = ci, iteration.ci = iteration.ci, quantiles.ci = quantiles.ci, digits = digits)
   # if (ci){
   df.select <- tibble::tibble(POP1 = pop.select[1], POP2 = pop.select[2])
@@ -895,7 +890,7 @@ fst_nei_subsample <- function(
   }
 
   # Keep only the subsample
-  input <- dplyr::semi_join(input, x, by = c("POP_ID", "INDIVIDUALS"))
+  input <- dplyr::semi_join(input, x, by = c("STRATA", "INDIVIDUALS"))
   x <- NULL #unused object
 
   # genotyped data and holdout sample  -----------------------------------------
@@ -915,7 +910,7 @@ fst_nei_subsample <- function(
   # Compute pairwise Fst -------------------------------------------------------
   if (pairwise) {
     if (verbose) message("Paiwise fst calculation")
-    pop.list <- levels(input$POP_ID) # pop list
+    pop.list <- levels(input$STRATA) # pop list
     # all combination of populations
     pop.pairwise <- utils::combn(unique(pop.list), 2, simplify = FALSE)
     # Fst for all pairwise populations

diff --git a/R/fst_WC84.R b/R/fst_WC84.R
@@ -381,7 +381,7 @@ fst_WC84 <- function(
     dplyr::arrange(POP_ID)
 
   # strip the data -------------------------------------------------------------
-  strata.bk <- markers.meta.bk <- genotypes.meta.bk <- NULL
+  # increases speed for large datasets
   env.arg <- rlang::current_env()
   data %<>%
     radiator::strip_rad(
@@ -417,7 +417,8 @@ fst_WC84 <- function(
     .x = 1:iteration.subsample,
     .f = subsampling_data,
     strata = strata.bk,
-    subsample = subsample
+    subsample = subsample,
+    random.seed = NULL
   )
 
   # keep track of subsampling individuals and write to directory
@@ -488,7 +489,7 @@ fst_WC84 <- function(
 
     # test1 <- res$pairwise.fst
     # test2 <- res$pairwise.fst.upper.matrix
-
+    # test3 <- res$pairwise.fst.full.matrix
 
     # pairwise.fst.upper.matrix
     # pairwise.fst.full.matrix
@@ -1518,11 +1519,6 @@ heatmap_fst <- function(
   if (is.null(pop.levels)) {
     pop.levels <- colnames(data.fst)
   }
-  # else {
-  # if (length(pop.levels) != length(union(rownames(data.fst), colnames(data.fst)))) {
-  # rlang::abort(message = "Contact author, problem with strata levels in heatmap")
-  # }
-  # }
 
   data.fst %<>%
     radiator::distance2tibble(
@@ -1536,7 +1532,6 @@ heatmap_fst <- function(
     magrittr::set_colnames(x = ., value = c("POP1", "POP2", "FST"))
 
   inv.levels <- rev(levels(data.fst$POP2))
-  # pop.levels <- levels(data.fst$POP2)
   rounder <- function(x, digits) round(as.numeric(x), digits)
   if (max(stringi::stri_length(data.fst$FST), na.rm = TRUE) != digits) {
     round.num <- TRUE
@@ -1560,7 +1555,6 @@ heatmap_fst <- function(
 
 
   data.fst %<>% dplyr::left_join(data.ci, by = c("POP1", "POP2"))
-  # median.fst <- median(x = data.fst$FST, na.rm = TRUE)
   mean.fst <- mean(x = data.fst$FST, na.rm = TRUE)
   min.fst <- min(x = data.fst$FST, na.rm = TRUE)
   max.fst <- max(x = data.fst$FST, na.rm = TRUE)
@@ -1606,28 +1600,31 @@ heatmap_fst <- function(
   heatmap.fst <- ggplot2::ggplot(
     data = data.fst, ggplot2::aes(x = POP1, y = POP2, fill = FST)
   ) +
-    ggplot2::geom_tile(color = "white") +
+    ggplot2::geom_tile(color = "white", alpha = 0.7) +
     ggplot2::geom_text(ggplot2::aes(x = POP1, y = POP2, label = CI),
                        color = "black", size = text.size, na.rm = TRUE) +
-    ggplot2::scale_fill_gradient2(
-      low = color.low,
-      mid = color.mid,
-      high = color.high,
-      # midpoint = median.fst,
-      midpoint = mean.fst,
-      na.value = "white",
-      limit = NULL,
-      # limit = c(min.fst, max.fst),
-      space = "Lab"
-    ) +
+    ggplot2::scale_fill_viridis_c(name = "Fst", option = "H") +
+    # ggplot2::scale_fill_gradient2(
+    #   low = color.low,
+    #   mid = color.mid,
+    #   high = color.high,
+    #   # midpoint = median.fst,
+    #   midpoint = mean.fst,
+    #   na.value = "white",
+    #   limit = NULL,
+    #   # limit = c(min.fst, max.fst),
+    #   space = "Lab"
+    # ) +
     ggplot2::scale_x_discrete(position = "top") +
     ggplot2::theme_minimal() +
     ggplot2::theme(
       axis.title.x = ggplot2::element_blank(),
       axis.title.y = ggplot2::element_blank(),
       axis.text.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
       axis.text.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold")
-    )
+    ) +
+    ggplot2::coord_equal()
+
   print(heatmap.fst)
 
   if (!is.null(filename)) {
@@ -1750,6 +1747,7 @@ assigner_fst_stats <- function(
 #' @keywords internal
 
 fst_stats <- function(x, l, digits = 9L, m = NULL, s = NULL, subsample = FALSE) {
+  # x = "pairwise.fst" # test
   res <- list()
   # message(x)
   want <- c("sigma.loc", "fst.markers", "fst.ranked", "fst.overall",
@@ -1868,26 +1866,41 @@ change_matrix_strata <- function(x, s) {
   # x
   # class(colnames(x))
   # class(rownames(x))
+
   if (length(dim(x)) > 1) {
-    colnames(x) %<>%
-      as.character(.) %>%
-      stringi::stri_replace_all_regex(
-        str = .,
-        pattern = paste0("^", as.character(s$STRATA_SEQ)),
-        replacement = as.character(s$POP_ID),
-        vectorize_all = FALSE
-      )
-    # colnames(x)
-    rownames(x) %<>%
-      as.character(.) %>%
-      stringi::stri_replace_all_regex(
-        str = .,
-        pattern = paste0("^", as.character(s$STRATA_SEQ)),
-        replacement = as.character(s$POP_ID),
-        vectorize_all = FALSE
-      )
+    if (identical(colnames(x), as.character(s$STRATA_SEQ))) {
+      colnames(x) <- as.character(s$POP_ID)
+    } else {
+      rlang::abort(message = "Contact author, problem with strata levels in fst")
+    }
+    if (identical(rownames(x), as.character(s$STRATA_SEQ))) {
+      rownames(x) <- as.character(s$POP_ID)
+    } else {
+      rlang::abort(message = "Contact author, problem with strata levels in fst")
+    }
   }
-  #
+
+  # problem with code below 20221026
+  # if (length(dim(x)) > 1) {
+  #   colnames(x) %<>%
+  #     as.character(.) %>%
+  #     stringi::stri_replace_all_regex(
+  #       str = .,
+  #       pattern = paste0("^", as.character(s$STRATA_SEQ)),
+  #       replacement = as.character(s$POP_ID),
+  #       vectorize_all = FALSE
+  #     )
+  #   # colnames(x)
+  #   rownames(x) %<>%
+  #     as.character(.) %>%
+  #     stringi::stri_replace_all_regex(
+  #       str = .,
+  #       pattern = paste0("^", as.character(s$STRATA_SEQ)),
+  #       replacement = as.character(s$POP_ID),
+  #       vectorize_all = FALSE
+  #     )
+  # }
+
   return(x)
 }#change_matrix_strata
 

diff --git a/R/global_variables.R b/R/global_variables.R
@@ -41,7 +41,8 @@ if (getRversion() >= "2.15.1") {
       "whitelist.markers", "random.seed", "folder", ".data", "MARKERS...1",
       "MARKERS...3", "MARKERS...4", "STRATA_SEQ", "ARGUMENTS", "VALUES",
       "input", "HET", "calibrate.alleles", "iteration.subsample", "n",
-      "M_SEQ", "ID_SEQ", "1", "2", "where", "p", "forking"
+      "M_SEQ", "ID_SEQ", "1", "2", "where", "p", "forking", "strata.bk",
+      "markers.meta.bk", "genotypes.meta.bk"
     )
   )
 }