Fix reading CNV report

Boehringer-Ingelheim · Jan 26, 2024 · 8853dd7 · 8853dd7
1 parent b085b74
commit 8853dd7
Showing 1 changed file with 17 additions and 8 deletions.
diff --git a/R/personalis.R b/R/personalis.R
@@ -149,7 +149,7 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample
   row_data <- all_variants |>
     select(
       mut_id,
-      Sequence,
+      Chromosome,
       POS,
       `Variant Type`,
       `Genomic Variant`
@@ -225,14 +225,17 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality,
     guess_max = GUESS_MAX
   ) |>
     mutate(sample = sample_name) |>
-    mutate(mut_id = sprintf("%s_%s_%s", Sequence, `Genomic Variant`, `Variant Type`))
+    # in older versions, the "Chromosome" column is called "Sequence"
+    rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
+    mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`))
 
   variant_table
 }
 
 #'
 #' @importFrom tidyr pivot_longer
 #' @importFrom dplyr bind_rows
+#' @importFrom purrr keep
 #' @keywords internal
 read_personalis_somatic_variants_summary_statistics <- function(sample_folder, modality, sample_type) {
   stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
@@ -252,7 +255,9 @@ read_personalis_somatic_variants_summary_statistics <- function(sample_folder, m
     html_elements("#somatic_variant_annotation") |>
     html_elements("table") |>
     html_table(na.strings = "N/A")
-  tables[2:3] |>
+  tables |>
+    # some reports contain two such tables, some only one
+    keep(\(x) colnames(x)[1] == "Summary Small Variants") |>
     lapply(function(df) {
       colnames(df) <- make.names(colnames(df))
       colnames(df)[1] <- "metric"
@@ -296,7 +301,7 @@ read_personalis_cnv_reports <- function(sample_paths) {
 
   all_cnv <- bind_rows(map(cnv_list, "cnv_report"))
   row_data <- all_cnv |>
-    select(cnv_id, `Gene Symbol`, `Sequence`, `Segment Start`, `Segment End`) |>
+    select(cnv_id, `Gene Symbol`, `Chromosome`, `Segment Start`, `Segment End`) |>
     distinct()
   stopifnot("cnv_id is not a unique identifier" = !any(duplicated(row_data$cnv_id)))
 
@@ -339,11 +344,11 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
     "Gene Symbol" = as.character,
     "CNA Type" = as.character,
     "AbsoluteCN" = as.numeric,
-    "Sequence" = as.character,
+    "Chromosome" = as.character,
     "Segment Start" = as.numeric,
     "Segment End" = as.numeric,
-    "Estimated Sample purity" = as.numeric,
-    "Estimated Sample Ploidy" = as.numeric,
+    # "Estimated Sample purity" = as.numeric,
+    # "Estimated Sample Ploidy" = as.numeric,
     "Percent of Gene in Event" = \(x) as.numeric(sub("%", "", x))
   )
   suppressWarnings({
@@ -353,9 +358,13 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
       # we also can't specify the columns at import time, because in some personalis versions, some columns
       # are omitted.
       amp = read_excel(cnv_file, sheet = "AMP", col_types = NULL) |>
+        # In older reports, the "Chromosome" column is called "Sequence"
+        rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
         select(-any_of(c("log posterior probability", "B-allele Frequency", "Allelotype", "Mean_log2Ratio"))) |>
         mutate(across(names(COL_TYPES), \(x) COL_TYPES[[cur_column()]](x))),
       del = read_excel(cnv_file, sheet = "DEL", col_types = NULL) |>
+        # In older reports, the "Chromosome" column is called "Sequence"
+        rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
         select(-any_of(c("Wilcoxon pvalue", "KS pvalue"))) |>
         mutate(across(names(COL_TYPES), \(x) COL_TYPES[[cur_column()]](x)))
     )
@@ -368,7 +377,7 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
     cnv_table <- cnv_table |>
       mutate(sample = sample_name) |>
       # if a segment spans multiple genes, there will be multiple rows per gene
-      mutate(cnv_id = sprintf("%s_%i_%i_%s", Sequence, `Segment Start`, `Segment End`, `Gene Symbol`))
+      mutate(cnv_id = sprintf("%s_%i_%i_%s", Chromosome, `Segment Start`, `Segment End`, `Gene Symbol`))
   }
 
   cnv_table