From 980e27b839828040e780d0b89d63bdc6876894fb Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Thu, 7 Mar 2024 14:10:51 +0100
Subject: [PATCH] Fix incompatibilities with more recent outputs (#3)

* Fix reading CNV report

* Roxygenize

* Fix parse copy number report

---------

Co-authored-by: grst <grst@users.noreply.github.com>
---
 DESCRIPTION    |  2 +-
 R/personalis.R | 61 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 40 insertions(+), 23 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 0f96187..324de69 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -6,7 +6,7 @@ Authors@R:
 Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects.
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.0
+RoxygenNote: 7.3.1
 Depends:
     SummarizedExperiment,
     readxl,
diff --git a/R/personalis.R b/R/personalis.R
index f97cb04..e7dc5e4 100644
--- a/R/personalis.R
+++ b/R/personalis.R
@@ -149,7 +149,7 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample
   row_data <- all_variants |>
     select(
       mut_id,
-      Sequence,
+      Chromosome,
       POS,
       `Variant Type`,
       `Genomic Variant`
@@ -225,7 +225,9 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality,
     guess_max = GUESS_MAX
   ) |>
     mutate(sample = sample_name) |>
-    mutate(mut_id = sprintf("%s_%s_%s", Sequence, `Genomic Variant`, `Variant Type`))
+    # in older versions, the "Chromosome" column is called "Sequence"
+    rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
+    mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`))
 
   variant_table
 }
@@ -233,6 +235,7 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality,
 #'
 #' @importFrom tidyr pivot_longer
 #' @importFrom dplyr bind_rows
+#' @importFrom purrr keep
 #' @keywords internal
 read_personalis_somatic_variants_summary_statistics <- function(sample_folder, modality, sample_type) {
   stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
@@ -252,7 +255,9 @@ read_personalis_somatic_variants_summary_statistics <- function(sample_folder, m
     html_elements("#somatic_variant_annotation") |>
     html_elements("table") |>
     html_table(na.strings = "N/A")
-  tables[2:3] |>
+  tables |>
+    # some reports contain two such tables, some only one
+    keep(\(x) "SNVs" %in% colnames(x)) |>
     lapply(function(df) {
       colnames(df) <- make.names(colnames(df))
       colnames(df)[1] <- "metric"
@@ -291,12 +296,14 @@ read_personalis_cnv_reports <- function(sample_paths) {
     return(NULL)
   }
 
-  col_data <- bind_rows(map(cnv_list, "summary_stats")) |>
-    tibble::column_to_rownames("sample")
+  col_data <- bind_rows(map(cnv_list, "summary_stats"))
+  if (nrow(col_data)) {
+    col_data <- col_data |> tibble::column_to_rownames("sample")
+  }
 
   all_cnv <- bind_rows(map(cnv_list, "cnv_report"))
   row_data <- all_cnv |>
-    select(cnv_id, `Gene Symbol`, `Sequence`, `Segment Start`, `Segment End`) |>
+    select(cnv_id, `Gene Symbol`, `Chromosome`, `Segment Start`, `Segment End`) |>
     distinct()
   stopifnot("cnv_id is not a unique identifier" = !any(duplicated(row_data$cnv_id)))
 
@@ -339,11 +346,11 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
     "Gene Symbol" = as.character,
     "CNA Type" = as.character,
     "AbsoluteCN" = as.numeric,
-    "Sequence" = as.character,
+    "Chromosome" = as.character,
     "Segment Start" = as.numeric,
     "Segment End" = as.numeric,
-    "Estimated Sample purity" = as.numeric,
-    "Estimated Sample Ploidy" = as.numeric,
+    # "Estimated Sample purity" = as.numeric,
+    # "Estimated Sample Ploidy" = as.numeric,
     "Percent of Gene in Event" = \(x) as.numeric(sub("%", "", x))
   )
   suppressWarnings({
@@ -353,9 +360,13 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
       # we also can't specify the columns at import time, because in some personalis versions, some columns
       # are omitted.
       amp = read_excel(cnv_file, sheet = "AMP", col_types = NULL) |>
+        # In older reports, the "Chromosome" column is called "Sequence"
+        rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
         select(-any_of(c("log posterior probability", "B-allele Frequency", "Allelotype", "Mean_log2Ratio"))) |>
         mutate(across(names(COL_TYPES), \(x) COL_TYPES[[cur_column()]](x))),
       del = read_excel(cnv_file, sheet = "DEL", col_types = NULL) |>
+        # In older reports, the "Chromosome" column is called "Sequence"
+        rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
         select(-any_of(c("Wilcoxon pvalue", "KS pvalue"))) |>
         mutate(across(names(COL_TYPES), \(x) COL_TYPES[[cur_column()]](x)))
     )
@@ -368,7 +379,7 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
     cnv_table <- cnv_table |>
       mutate(sample = sample_name) |>
       # if a segment spans multiple genes, there will be multiple rows per gene
-      mutate(cnv_id = sprintf("%s_%i_%i_%s", Sequence, `Segment Start`, `Segment End`, `Gene Symbol`))
+      mutate(cnv_id = sprintf("%s_%i_%i_%s", Chromosome, `Segment Start`, `Segment End`, `Gene Symbol`))
   }
 
   cnv_table
@@ -386,18 +397,24 @@ read_personalis_cnv_summary_statistics <- function(sample_folder) {
     sprintf("DNA_%s_dna_statistics.html", sample_name)
   )
   # unfortunately, this is not a table, but a div of divs that looks like a table
-  table <- (read_html(html_file) |> html_elements("#copy_number"))[[1]]
-  titles <- table |>
-    html_nodes(".title") |>
-    html_text()
-  values <- table |>
-    html_nodes(".value") |>
-    html_text()
-
-  cnv_metrics <- tibble(metric = titles[1:5], value = values[1:5]) |>
-    mutate(sample = sample_name) |>
-    pivot_wider(id_cols = sample, names_from = metric, values_from = value)
-  cnv_metrics
+  table <- (read_html(html_file) |> html_elements("#copy_number"))
+  # unfortunately, the "#copy_number" element seems to be missing in newer versions of the report
+  if (!length(table)) {
+    return(tibble())
+  } else {
+    table <- table[[1]]
+    titles <- table |>
+      html_nodes(".title") |>
+      html_text()
+    values <- table |>
+      html_nodes(".value") |>
+      html_text()
+
+    cnv_metrics <- tibble(metric = titles[1:5], value = values[1:5]) |>
+      mutate(sample = sample_name) |>
+      pivot_wider(id_cols = sample, names_from = metric, values_from = value)
+    cnv_metrics
+  }
 }