From 9aa2a50ea8ab2bc02d02680e1bf733816d30a624 Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Thu, 7 Mar 2024 14:16:03 +0100
Subject: [PATCH 1/9] Vcf (#5)

* Initial commit

* Update DESCRIPTION

* bootstrap tests

* Test CI

* Prototype function to load gene expression data to SE

* Stub functions

* Add progress bar

* Add pre-commit config

* Add personalis functions to package and run roxygen

* Add relevant documentation on personalis outputs

* Update exploratory notebook

* Fix pre-commit prettier

* Format files according to pre-commit checks

* Draft function to read small variant data

* configure lintr

* Implement basic functionality to read variant and GEX data into MAE

* Implement basic error handling for missing samples

* Generalize warning for missing samplese

* Read CNV data

* Fix CNV IO function in case of empty tables

* Add function to read personalis HLA data

* Add function to read in TCR data

* Scrape TCR summary statistics from HTML

* Implement function to read somatic variant statistics

* Read summary stats for somatic variants

* Implement bumpy_matrix_to_df

* Read CNV summary statistics

* Read MSI info

* refactor

* Workaround for samples with no col in bumpy matrix

* Apply the fix also to small variant data

* Use "Genomic Variant" instead of pos as unique variant identifier

* Fix issue with reading non-somatic variants

* Handle case when there are no samples for a modality

* Fix duplicated mutation ids

* Fix column name incompatibility in newer HTML report versions

* stub vignette

* Add vignette

* Update vignette

* Ensure bumpy matrix, row and coldata have consistent order

* Fix alternative gex filename and CNV import

* Support alternative TCR path

* Fix column conversion in CNV reader

* Fix paths

* add function for parsing VCF files

* add functionality for reading and storing VCF data

* add/change comments

* add option to read small variant reports of type all

* Angewendeter Vorschlag

* Angewendeter Vorschlag

* add sample type check

* Angewendeter Vorschlag

* Angewendeter Vorschlag

* add report_type parameter

* Update README

* Fix reading CNV report

* Roxygenize

* Fix parse copy number report

---------

Co-authored-by: Christopher Mohr <christopher.mohr@boehringer-ingelheim.com>
Co-authored-by: grst <grst@users.noreply.github.com>
---
 R/personalis.R | 278 ++++++++++++++++++++++++++++++++++++++++++-------
 R/util.R       |  28 +++++
 2 files changed, 268 insertions(+), 38 deletions(-)

diff --git a/R/personalis.R b/R/personalis.R
index f97cb04..9ba2742 100644
--- a/R/personalis.R
+++ b/R/personalis.R
@@ -3,30 +3,51 @@
 GUESS_MAX <- 21474836 # largest value without warning - this is anyway more than an excel sheet can have rows
 
 #' Read a set of personalis results folders into a MultiAssayExperiment object
-#' @param pathlist List with paths to Personalis results folders
 #' @importFrom MultiAssayExperiment MultiAssayExperiment
+#' @param pathlist List with paths to Personalis results folders
+#' @param report_type Type of small variant report that should be read (default: preferred)
+#' @param read_vcf_data Boolean to specify if VCF file should be read in as well (default: FALSE)
 #' @return MultiAssayExperiment
 #' @export
-read_personalis <- function(pathlist) {
+read_personalis <- function(pathlist, report_type="preferred", read_vcf_data=FALSE) {
+  stopifnot("`report_type` must be one of 'preferred' or 'all'." = report_type %in% c("preferred", "all"))
+  
   experiments <- list()
   message(">>> Reading gene expression...")
   experiments[["gene_expression"]] <- read_personalis_gene_expression(pathlist)
 
   message(">>> Reading DNA small variant data... ")
   message("Reading DNA small variant data (somatic)... ")
-  experiments[["dna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "somatic")
+  experiments[["dna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "somatic", report_type)
   message("Reading DNA small variant data (tumor)... ")
-  experiments[["dna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "tumor")
+  experiments[["dna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "tumor", report_type)
   message("Reading DNA small variant data (normal)... ")
-  experiments[["dna_small_variants_normalr"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "normal")
-
+  experiments[["dna_small_variants_normal"]] <- read_personalis_small_variant_reports(pathlist, "DNA", "normal", report_type)
+
+  if (read_vcf_data) {
+    message("Reading DNA small variant data (somatic) from VCF files... ")
+    experiments[["dna_small_variants_somatic_vcf"]] <- read_personalis_vcf_files(pathlist, "DNA", "somatic")
+    message("Reading DNA small variant data (tumor) from VCF files... ")
+    experiments[["dna_small_variants_tumor_vcf"]] <- read_personalis_vcf_files(pathlist, "DNA", "tumor")
+    message("Reading DNA small variant data (normal) from VCF files... ")
+    experiments[["dna_small_variants_normal_vcf"]] <- read_personalis_vcf_files(pathlist, "DNA", "normal")
+  }
+  
   message(">>> Reading RNA small variant data... ")
   message("Reading RNA small variant data (somatic)... ")
-  experiments[["rna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "somatic")
+  experiments[["rna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "somatic", report_type)
   message("Reading RNA small variant data (tumor)... ")
-  experiments[["rna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "tumor")
+  experiments[["rna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "tumor", report_type)
   message("Reading RNA small variant data (normal)... ")
-  experiments[["rna_small_variants_normal"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "normal")
+  experiments[["rna_small_variants_normal"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "normal", report_type)
+  
+  # for RNA we only have tumor or somatic variant VCF data
+  if (read_vcf_data) {
+    message("Reading RNA small variant data (somatic) from VCF files... ")
+    experiments[["rna_small_variants_somatic_vcf"]] <- read_personalis_vcf_files(pathlist, "RNA", "somatic")
+    message("Reading RNA small variant data (tumor) from VCF files... ")
+    experiments[["rna_small_variants_tumor_vcf"]] <- read_personalis_vcf_files(pathlist, "RNA", "tumor")
+  }
 
   message(">>> Reading microsatellite stability (MSI) data... ")
   experiments[["msi"]] <- read_personalis_msi_reports(pathlist)
@@ -118,11 +139,12 @@ read_personalis_gene_expression_sample <- function(sample_folder) {
 #'   'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
 #'   'somatic' refers to tumor vs. normal (i.e. somatic mutations only).
 #' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA'
+#' @param report_type report_type which should be read. Can be either 'preferred' or 'all'
 #' @return SummarizedExperiment
 #' @importFrom dplyr select distinct bind_rows
 #' @importFrom purrr map
 #' @export
-read_personalis_small_variant_reports <- function(sample_paths, modality, sample_type) {
+read_personalis_small_variant_reports <- function(sample_paths, modality, sample_type, report_type) {
   read_all <- function(sample, ...) {
     list(
       variant_report = read_personalis_small_variant_report_sample(sample, ...),
@@ -134,7 +156,8 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample
     read_all,
     sprintf("%s small variant (%s)", modality, sample_type),
     modality = modality,
-    sample_type = sample_type
+    sample_type = sample_type,
+    report_type = report_type
   )
 
   if (!length(variant_list)) {
@@ -149,7 +172,7 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample
   row_data <- all_variants |>
     select(
       mut_id,
-      Sequence,
+      Chromosome,
       POS,
       `Variant Type`,
       `Genomic Variant`
@@ -194,23 +217,27 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample
 #' We also do not read the `cancer_clinical_research`, the `cancer_research` and the `lowpupulationfreq` reports,
 #' because they are subsets of the full report.
 #'
-#' In addition, Personalis also provides raw VCF files with all (unfiltered) variants. We currently don't
-#' read them in because without additional filtering they are not very useful. If you need this level of information,
-#' feel free to start from the raw vcf files or even run your own variant calling based on the FASTQ files.
+#' In addition, Personalis also provides raw VCF files with all (unfiltered) variants. If you need this level of information,
+#' please specify read_vcf_data=TRUE in your read_personalis() call.
 #'
 #' @param sample_folder path to personalis folder
-#' @param sample_types Can be ome or multiple of of 'tumor', 'normal', or 'somatic'.
+#' @param sample_types Can be one or multiple of of 'tumor', 'normal', or 'somatic'.
 #'   'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
 #'   'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
 #'   'somatic' refers to tumor vs. normal (i.e. somatic mutations only).
 #' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA'
+#' @param report_type report_type which should be read. Can be either 'preferred' or 'all'
 #' @return data frame with all variants
 #' @importFrom dplyr mutate
 #' @keywords internal
-read_personalis_small_variant_report_sample <- function(sample_folder, modality, sample_type) {
+read_personalis_small_variant_report_sample <- function(sample_folder, modality, sample_type, report_type) {
   stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
   stopifnot("`sample_types` must be in 'tumor', 'normal', or 'somatic'." = sample_type %in% c("tumor", "normal", "somatic"))
+  stopifnot("`report_type` must be one of 'preferred' or 'full'." = report_type %in% c("preferred", "all"))
   sample_name <- basename(sample_folder)
+  # determine report folder based on parameter
+  report_folder <- ifelse(report_type == "preferred", "Preferred_Transcripts", "All_Transcripts")
+  
   # the prefix of the normal filename ends with N rather than T, e.g. K13T -> tumor, K13N -> normal
   # even though they are all in the K13T folder.
   tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name)
@@ -219,13 +246,15 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality,
       sample_folder,
       sprintf("%s_Pipeline", modality),
       "Annotated_SmallVariant_Reports",
-      "Preferred_Transcripts",
-      sprintf("%s_%s_%s_%s_small_variant_report_preferred.xlsx", modality, tmp_sample_name, sample_type, tolower(modality))
+      report_folder,
+      sprintf("%s_%s_%s_%s_small_variant_report_%s.xlsx", modality, tmp_sample_name, sample_type, tolower(modality), report_type)
     ),
     guess_max = GUESS_MAX
   ) |>
     mutate(sample = sample_name) |>
-    mutate(mut_id = sprintf("%s_%s_%s", Sequence, `Genomic Variant`, `Variant Type`))
+    # in older versions, the "Chromosome" column is called "Sequence"
+    rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
+    mutate(mut_id = sprintf("%s_%s_%s", Chromosome, `Genomic Variant`, `Variant Type`))
 
   variant_table
 }
@@ -233,8 +262,9 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality,
 #'
 #' @importFrom tidyr pivot_longer
 #' @importFrom dplyr bind_rows
+#' @importFrom purrr keep
 #' @keywords internal
-read_personalis_somatic_variants_summary_statistics <- function(sample_folder, modality, sample_type) {
+read_personalis_somatic_variants_summary_statistics <- function(sample_folder, modality, sample_type, report_type) {
   stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
   sample_name <- basename(sample_folder)
 
@@ -252,7 +282,9 @@ read_personalis_somatic_variants_summary_statistics <- function(sample_folder, m
     html_elements("#somatic_variant_annotation") |>
     html_elements("table") |>
     html_table(na.strings = "N/A")
-  tables[2:3] |>
+  tables |>
+    # some reports contain two such tables, some only one
+    keep(\(x) "SNVs" %in% colnames(x)) |>
     lapply(function(df) {
       colnames(df) <- make.names(colnames(df))
       colnames(df)[1] <- "metric"
@@ -267,6 +299,158 @@ read_personalis_somatic_variants_summary_statistics <- function(sample_folder, m
     pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value")
 }
 
+#' Read HTML report on variant calling summary statistics and store information
+#' based on given modality and sample_type:
+#' DNA -> Normal DNA or Tumor DNA "Summary Small Variants" table
+#' RNA -> Tumor RNA "Summary Small Variants" table
+#' somatic -> "Tumor/Normal Concordance" table
+#'
+#' @param sample_folder path to personalis folder
+#' @param sample_type Can be one or multiple of of 'tumor', 'normal', or 'somatic'.
+#'   'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
+#'   'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
+#'   'somatic' refers to tumor vs. normal (i.e. somatic mutations only).
+#' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA'
+#' @importFrom rvest read_html html_elements html_nodes html_text
+#' @importFrom tidyr pivot_longer
+#' @importFrom stringr str_to_title
+read_personalis_variant_calling_summary_statistics <- function(sample_folder, modality, sample_type) {
+  stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
+  stopifnot("`sample_type` must be one of 'tumor', 'normal', or 'somatic'." = sample_type %in% c("tumor", "normal", "somatic"))
+  sample_name <- basename(sample_folder)
+  
+  html_file <- file.path(
+    sample_folder,
+    "QC_REPORT",
+    sprintf("%s_%s_%s_statistics.html", modality, sample_name, tolower(modality))
+  )
+  
+  # decide based on modality and sample_type which content to read from html
+  html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality))
+  table_number <- if_else(sample_type == "somatic", 1, 2)
+  columns_to_fix <- if(sample_type == 'somatic') c() else c('SNVs', 'Indels', 'Total')
+
+  tables <- read_html(html_file) |>
+    html_elements(html_section) |>
+    html_elements("table") |>
+    html_table(na.strings = "N/A")
+  tes <- tables[table_number] |>
+    lapply(function(df) {
+      colnames(df) <- make.names(colnames(df))
+      colnames(df)[1] <- "metric"
+      df |>
+        mutate(across(all_of(columns_to_fix), fix_thousands_separator))
+    }) |>
+    bind_rows() |>
+    pivot_longer(-metric, names_to = "mut_type", values_to = "value") |>
+    mutate(sample = sample_name) |>
+    mutate(var_name = sprintf("%s (%s)", metric, mut_type)) |>
+    select(sample, var_name, value) |>
+    pivot_wider(id_cols = sample, names_from = "var_name", values_from = "value") |>
+    mutate(across(contains("Number"), fix_thousands_separator))
+}
+
+#' Read in (unfiltered) small variant data from VCF files from personalis folders
+#' @param sample_list A vector of paths to personalis folders
+#' @return SummarizedExperiment
+#' @export
+read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
+  stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
+
+  read_all <- function(sample, ...) {
+    list(
+      vcf_data = read_personalis_vcf_files_sample(sample, ...),
+      summary_stats = read_personalis_variant_calling_summary_statistics(sample, ...)
+    )
+  }
+  
+  variant_list <- read_samples(
+    sample_paths,
+    read_all,
+    sprintf("%s small variant VCF data (%s)", modality, sample_type),
+    modality = modality,
+    sample_type = sample_type
+  )
+  
+  if (!length(variant_list)) {
+    return(NULL)
+  }
+  
+  col_data <- map(variant_list, "summary_stats") |>
+    bind_rows() |>
+    tibble::column_to_rownames("sample")
+
+  all_variants <- map(variant_list, "vcf_data") |> bind_rows()
+  row_data <- all_variants |>
+    select(
+      mut_id,
+      CHROM, 
+      POS, 
+      REF,
+      ALT
+    ) |>
+    distinct()
+  stopifnot("mut_id is not a unique identifier" = !anyDuplicated(row_data$mut_id))
+  
+  # ignore columns that are already in `row_data` except for the `mut_id` identifier
+  row_data_cols <- colnames(row_data)[colnames(row_data) != "mut_id"]
+  all_variants <- all_variants[, !(colnames(all_variants) %in% row_data_cols)]
+  all_variants <- add_dummy_entry(all_variants, col_data)
+  variants_bm <- splitAsBumpyMatrix(
+    all_variants,
+    row = all_variants$mut_id,
+    column = all_variants$sample,
+    sparse = TRUE
+  )
+  # sort row and col data to be consistent with BM
+  row_data <- tibble::column_to_rownames(row_data, "mut_id")[rownames(variants_bm), ]
+  col_data <- col_data[colnames(variants_bm), ]
+  
+  se <- SummarizedExperiment(
+    assays = list(
+      variants = variants_bm
+    ),
+    rowData = row_data,
+    colData = col_data
+  )
+  
+}
+
+#' Read Personalis (raw) VCF files for a provided folders/multiple samples and merge to one table.
+#'
+#' @param sample_folder base path to personalis folders
+#' @param sample_type Can be one or multiple of of 'tumor', 'normal', or 'somatic'.
+#'   'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
+#'   'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
+#'   'somatic' refers to tumor vs. normal (i.e. somatic mutations only).
+#' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA'
+#' @return data frame with all variants
+read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_type) {
+  stopifnot("`modality` must be one of 'DNA'/'dna' or 'RNA'/'rna'." = modality %in% c("DNA", "RNA", "dna", "rna"))
+  stopifnot("`sample_type` must be in 'tumor', 'normal', or 'somatic'." = sample_type %in% c("tumor", "normal", "somatic"))
+  sample_name <- basename(sample_folder)
+  # the prefix of the normal filename ends with N rather than T, e.g. K13T -> tumor, K13N -> normal
+  # even though they are all in the K13T folder.
+  tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name)
+  
+  # tumor DNA VCF files are gzipped, not totally clear if this rule applies to all cases
+  file_ending <- if_else(sample_type == "tumor" & modality == "DNA", "vcf.gz", "vcf")
+  
+  variant_table <- parse_vcf_to_df(
+    file.path(
+      sample_folder,
+      sprintf("%s_Pipeline", modality),
+      "Variants", 
+      sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending
+        )
+      )
+    ) %>% 
+    mutate(sample = sample_name) %>%
+    mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
+  
+  variant_table
+}
+
 
 #
 # -------------------- CNV --------------------------
@@ -291,6 +475,14 @@ read_personalis_cnv_reports <- function(sample_paths) {
     return(NULL)
   }
 
+  col_data <- bind_rows(map(cnv_list, "summary_stats"))
+  if (nrow(col_data)) {
+    col_data <- col_data |> tibble::column_to_rownames("sample")
+  }
+
+  all_cnv <- bind_rows(map(cnv_list, "cnv_report"))
+  row_data <- all_cnv |>
+    select(cnv_id, `Gene Symbol`, `Chromosome`, `Segment Start`, `Segment End`) |>
   col_data <- bind_rows(map(cnv_list, "summary_stats")) |>
     tibble::column_to_rownames("sample")
 
@@ -339,11 +531,11 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
     "Gene Symbol" = as.character,
     "CNA Type" = as.character,
     "AbsoluteCN" = as.numeric,
-    "Sequence" = as.character,
+    "Chromosome" = as.character,
     "Segment Start" = as.numeric,
     "Segment End" = as.numeric,
-    "Estimated Sample purity" = as.numeric,
-    "Estimated Sample Ploidy" = as.numeric,
+    # "Estimated Sample purity" = as.numeric,
+    # "Estimated Sample Ploidy" = as.numeric,
     "Percent of Gene in Event" = \(x) as.numeric(sub("%", "", x))
   )
   suppressWarnings({
@@ -353,9 +545,13 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
       # we also can't specify the columns at import time, because in some personalis versions, some columns
       # are omitted.
       amp = read_excel(cnv_file, sheet = "AMP", col_types = NULL) |>
+        # In older reports the "Chromosome" column is called sequence
+        rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
         select(-any_of(c("log posterior probability", "B-allele Frequency", "Allelotype", "Mean_log2Ratio"))) |>
         mutate(across(names(COL_TYPES), \(x) COL_TYPES[[cur_column()]](x))),
       del = read_excel(cnv_file, sheet = "DEL", col_types = NULL) |>
+        # In older reports the "Chromosome" column is called sequence
+        rename_with(\(x) if_else(x == "Sequence", "Chromosome", x)) |>
         select(-any_of(c("Wilcoxon pvalue", "KS pvalue"))) |>
         mutate(across(names(COL_TYPES), \(x) COL_TYPES[[cur_column()]](x)))
     )
@@ -368,7 +564,7 @@ read_personalis_cnv_report_sample <- function(sample_folder) {
     cnv_table <- cnv_table |>
       mutate(sample = sample_name) |>
       # if a segment spans multiple genes, there will be multiple rows per gene
-      mutate(cnv_id = sprintf("%s_%i_%i_%s", Sequence, `Segment Start`, `Segment End`, `Gene Symbol`))
+      mutate(cnv_id = sprintf("%s_%i_%i_%s", Chromosome, `Segment Start`, `Segment End`, `Gene Symbol`))
   }
 
   cnv_table
@@ -386,18 +582,24 @@ read_personalis_cnv_summary_statistics <- function(sample_folder) {
     sprintf("DNA_%s_dna_statistics.html", sample_name)
   )
   # unfortunately, this is not a table, but a div of divs that looks like a table
-  table <- (read_html(html_file) |> html_elements("#copy_number"))[[1]]
-  titles <- table |>
-    html_nodes(".title") |>
-    html_text()
-  values <- table |>
-    html_nodes(".value") |>
-    html_text()
-
-  cnv_metrics <- tibble(metric = titles[1:5], value = values[1:5]) |>
-    mutate(sample = sample_name) |>
-    pivot_wider(id_cols = sample, names_from = metric, values_from = value)
-  cnv_metrics
+  table <- (read_html(html_file) |> html_elements("#copy_number"))
+  # unfortunately, it seems missing in newer versions of the report
+  if (!length(table)) {
+    return(tibble())
+  } else {
+    table <- table[[1]]
+    titles <- table |>
+      html_nodes(".title") |>
+      html_text()
+    values <- table |>
+      html_nodes(".value") |>
+      html_text()
+
+    cnv_metrics <- tibble(metric = titles[1:5], value = values[1:5]) |>
+      mutate(sample = sample_name) |>
+      pivot_wider(id_cols = sample, names_from = metric, values_from = value)
+    cnv_metrics
+  }
 }
 
 
diff --git a/R/util.R b/R/util.R
index ce14b0a..1528b9e 100644
--- a/R/util.R
+++ b/R/util.R
@@ -114,3 +114,31 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
     dummy_entries
   )
 }
+
+#' Parse VCF files for a provided path and construct data frame.
+#'
+#' @param path path to VCF file in `*.vcf` or `*.vcf.gz` format
+#' @return {tibble} new data frame with all variants (fixed field and genotype information)
+#' @importFrom dplyr mutate left_join
+#' @importFrom vcfR read.vcfR vcfR2tidy
+#' @importFrom stringr str_split_i
+parse_vcf_to_df <- function(path) {
+  # parse VCF file
+  vcf_content <- read.vcfR(path)
+  
+  # fixed field content to data frame
+  fixed_df <- vcfR2tidy(vcf_content)$fix
+  
+  # GT content to data frame
+  gt_df <- vcfR2tidy(vcf_content)$gt
+  
+  # create addition column with observed nucleotides in order to avoid collisions when we do the left_join
+  gt_df <- gt_df %>%
+    dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2))
+  
+  # next use ChromKey, POS and ALT for joining vcf content data frames
+  joined_vcf_df <- fixed_df %>%
+    dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))
+  
+  as_tibble(joined_vcf_df)
+}

From eb2328a54608eab338f60dc3dd36b132da4a1740 Mon Sep 17 00:00:00 2001
From: grst <grst@users.noreply.github.com>
Date: Thu, 7 Mar 2024 13:27:20 +0000
Subject: [PATCH 2/9] Roxygenize

---
 NAMESPACE                                     |  6 ++++
 man/parse_vcf_to_df.Rd                        | 17 ++++++++++
 man/read_personalis.Rd                        |  6 +++-
 ..._personalis_small_variant_report_sample.Rd | 12 ++++---
 man/read_personalis_small_variant_reports.Rd  |  9 ++++-
 ...alis_variant_calling_summary_statistics.Rd | 33 +++++++++++++++++++
 man/read_personalis_vcf_files.Rd              | 17 ++++++++++
 man/read_personalis_vcf_files_sample.Rd       | 24 ++++++++++++++
 8 files changed, 117 insertions(+), 7 deletions(-)
 create mode 100644 man/parse_vcf_to_df.Rd
 create mode 100644 man/read_personalis_variant_calling_summary_statistics.Rd
 create mode 100644 man/read_personalis_vcf_files.Rd
 create mode 100644 man/read_personalis_vcf_files_sample.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 7cff0c9..99bf535 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -7,6 +7,7 @@ export(read_personalis_hla_reports)
 export(read_personalis_msi_reports)
 export(read_personalis_small_variant_reports)
 export(read_personalis_tcr_reports)
+export(read_personalis_vcf_files)
 importFrom(MultiAssayExperiment,MultiAssayExperiment)
 importFrom(SummarizedExperiment,SummarizedExperiment)
 importFrom(dplyr,across)
@@ -15,6 +16,7 @@ importFrom(dplyr,bind_rows)
 importFrom(dplyr,cur_column)
 importFrom(dplyr,distinct)
 importFrom(dplyr,if_else)
+importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,select)
 importFrom(purrr,keep)
@@ -25,6 +27,10 @@ importFrom(rvest,html_nodes)
 importFrom(rvest,html_table)
 importFrom(rvest,html_text)
 importFrom(rvest,read_html)
+importFrom(stringr,str_split_i)
+importFrom(stringr,str_to_title)
 importFrom(tibble,tibble)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
+importFrom(vcfR,read.vcfR)
+importFrom(vcfR,vcfR2tidy)
diff --git a/man/parse_vcf_to_df.Rd b/man/parse_vcf_to_df.Rd
new file mode 100644
index 0000000..1f667a9
--- /dev/null
+++ b/man/parse_vcf_to_df.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/util.R
+\name{parse_vcf_to_df}
+\alias{parse_vcf_to_df}
+\title{Parse VCF files for a provided path and construct data frame.}
+\usage{
+parse_vcf_to_df(path)
+}
+\arguments{
+\item{path}{path to VCF file in \verb{*.vcf} or \verb{*.vcf.gz} format}
+}
+\value{
+{tibble} new data frame with all variants (fixed field and genotype information)
+}
+\description{
+Parse VCF files for a provided path and construct data frame.
+}
diff --git a/man/read_personalis.Rd b/man/read_personalis.Rd
index 40e43ab..f9c8e7d 100644
--- a/man/read_personalis.Rd
+++ b/man/read_personalis.Rd
@@ -4,10 +4,14 @@
 \alias{read_personalis}
 \title{Read a set of personalis results folders into a MultiAssayExperiment object}
 \usage{
-read_personalis(pathlist)
+read_personalis(pathlist, report_type = "preferred", read_vcf_data = FALSE)
 }
 \arguments{
 \item{pathlist}{List with paths to Personalis results folders}
+
+\item{report_type}{Type of small variant report that should be read (default: preferred)}
+
+\item{read_vcf_data}{Boolean to specify if VCF file should be read in as well (default: FALSE)}
 }
 \value{
 MultiAssayExperiment
diff --git a/man/read_personalis_small_variant_report_sample.Rd b/man/read_personalis_small_variant_report_sample.Rd
index a4f9bc8..167f544 100644
--- a/man/read_personalis_small_variant_report_sample.Rd
+++ b/man/read_personalis_small_variant_report_sample.Rd
@@ -7,7 +7,8 @@
 read_personalis_small_variant_report_sample(
   sample_folder,
   modality,
-  sample_type
+  sample_type,
+  report_type
 )
 }
 \arguments{
@@ -15,7 +16,9 @@ read_personalis_small_variant_report_sample(
 
 \item{modality}{modality from which the variants were called. Can be either 'DNA' or 'RNA'}
 
-\item{sample_types}{Can be ome or multiple of of 'tumor', 'normal', or 'somatic'.
+\item{report_type}{report_type which should be read. Can be either 'preferred' or 'all'}
+
+\item{sample_types}{Can be one or multiple of of 'tumor', 'normal', or 'somatic'.
 'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
 'somatic' refers to tumor vs. normal (i.e. somatic mutations only).}
@@ -36,8 +39,7 @@ COSMIC, the default transcript would be the one corresponding to the longest CDS
 We also do not read the \code{cancer_clinical_research}, the \code{cancer_research} and the \code{lowpupulationfreq} reports,
 because they are subsets of the full report.
 
-In addition, Personalis also provides raw VCF files with all (unfiltered) variants. We currently don't
-read them in because without additional filtering they are not very useful. If you need this level of information,
-feel free to start from the raw vcf files or even run your own variant calling based on the FASTQ files.
+In addition, Personalis also provides raw VCF files with all (unfiltered) variants. If you need this level of information,
+please specify read_vcf_data=TRUE in your read_personalis() call.
 }
 \keyword{internal}
diff --git a/man/read_personalis_small_variant_reports.Rd b/man/read_personalis_small_variant_reports.Rd
index 82c7b45..dbfb9f0 100644
--- a/man/read_personalis_small_variant_reports.Rd
+++ b/man/read_personalis_small_variant_reports.Rd
@@ -5,7 +5,12 @@
 \title{Read in small variant data from personalis folder
 We only read in the "Preferred Transcript" report here:}
 \usage{
-read_personalis_small_variant_reports(sample_paths, modality, sample_type)
+read_personalis_small_variant_reports(
+  sample_paths,
+  modality,
+  sample_type,
+  report_type
+)
 }
 \arguments{
 \item{sample_paths}{A vector of paths to personalis folders}
@@ -16,6 +21,8 @@ read_personalis_small_variant_reports(sample_paths, modality, sample_type)
 'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
 'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
 'somatic' refers to tumor vs. normal (i.e. somatic mutations only).}
+
+\item{report_type}{report_type which should be read. Can be either 'preferred' or 'all'}
 }
 \value{
 SummarizedExperiment
diff --git a/man/read_personalis_variant_calling_summary_statistics.Rd b/man/read_personalis_variant_calling_summary_statistics.Rd
new file mode 100644
index 0000000..463cecf
--- /dev/null
+++ b/man/read_personalis_variant_calling_summary_statistics.Rd
@@ -0,0 +1,33 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/personalis.R
+\name{read_personalis_variant_calling_summary_statistics}
+\alias{read_personalis_variant_calling_summary_statistics}
+\title{Read HTML report on variant calling summary statistics and store information
+based on given modality and sample_type:
+DNA -> Normal DNA or Tumor DNA "Summary Small Variants" table
+RNA -> Tumor RNA "Summary Small Variants" table
+somatic -> "Tumor/Normal Concordance" table}
+\usage{
+read_personalis_variant_calling_summary_statistics(
+  sample_folder,
+  modality,
+  sample_type
+)
+}
+\arguments{
+\item{sample_folder}{path to personalis folder}
+
+\item{modality}{modality from which the variants were called. Can be either 'DNA' or 'RNA'}
+
+\item{sample_type}{Can be one or multiple of of 'tumor', 'normal', or 'somatic'.
+'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
+'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
+'somatic' refers to tumor vs. normal (i.e. somatic mutations only).}
+}
+\description{
+Read HTML report on variant calling summary statistics and store information
+based on given modality and sample_type:
+DNA -> Normal DNA or Tumor DNA "Summary Small Variants" table
+RNA -> Tumor RNA "Summary Small Variants" table
+somatic -> "Tumor/Normal Concordance" table
+}
diff --git a/man/read_personalis_vcf_files.Rd b/man/read_personalis_vcf_files.Rd
new file mode 100644
index 0000000..361f23a
--- /dev/null
+++ b/man/read_personalis_vcf_files.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/personalis.R
+\name{read_personalis_vcf_files}
+\alias{read_personalis_vcf_files}
+\title{Read in (unfiltered) small variant data from VCF files from personalis folders}
+\usage{
+read_personalis_vcf_files(sample_paths, modality, sample_type)
+}
+\arguments{
+\item{sample_list}{A vector of paths to personalis folders}
+}
+\value{
+SummarizedExperiment
+}
+\description{
+Read in (unfiltered) small variant data from VCF files from personalis folders
+}
diff --git a/man/read_personalis_vcf_files_sample.Rd b/man/read_personalis_vcf_files_sample.Rd
new file mode 100644
index 0000000..7af2d74
--- /dev/null
+++ b/man/read_personalis_vcf_files_sample.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/personalis.R
+\name{read_personalis_vcf_files_sample}
+\alias{read_personalis_vcf_files_sample}
+\title{Read Personalis (raw) VCF files for a provided folders/multiple samples and merge to one table.}
+\usage{
+read_personalis_vcf_files_sample(sample_folder, modality, sample_type)
+}
+\arguments{
+\item{sample_folder}{base path to personalis folders}
+
+\item{modality}{modality from which the variants were called. Can be either 'DNA' or 'RNA'}
+
+\item{sample_type}{Can be one or multiple of of 'tumor', 'normal', or 'somatic'.
+'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
+'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
+'somatic' refers to tumor vs. normal (i.e. somatic mutations only).}
+}
+\value{
+data frame with all variants
+}
+\description{
+Read Personalis (raw) VCF files for a provided folders/multiple samples and merge to one table.
+}

From 04bb27ebeca68416c6cb69771afee9e72b7b762a Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Thu, 7 Mar 2024 14:29:24 +0100
Subject: [PATCH 3/9] trigger ci


From 7c9503ad8ebb2ec2a6a7151e3c8dda5e0c5f6923 Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Mon, 11 Mar 2024 09:15:09 +0100
Subject: [PATCH 4/9] add missing dependencies

---
 DESCRIPTION | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 324de69..92302b8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,7 +17,9 @@ Depends:
     purrr,
     dplyr,
     BumpyMatrix,
-    rvest
+    rvest,
+    stringr, 
+    vcfR
 Suggests: 
     knitr,
     rmarkdown,

From 6ae3f7fbb1526a437d9fafb6f4e419ed121a5585 Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Mon, 11 Mar 2024 09:25:19 +0100
Subject: [PATCH 5/9] Depends -> Imports

---
 DESCRIPTION    | 4 ++--
 R/personalis.R | 7 ++++++-
 R/util.R       | 2 ++
 3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 92302b8..01b03a1 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -7,7 +7,8 @@ Description: This package provides convenience functions for reading real-world
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
-Depends:
+Imports:
+    dplyr
     SummarizedExperiment,
     readxl,
     MultiAssayExperiment,
@@ -15,7 +16,6 @@ Depends:
     pbapply,
     tidyr,
     purrr,
-    dplyr,
     BumpyMatrix,
     rvest,
     stringr, 
diff --git a/R/personalis.R b/R/personalis.R
index 6d9b5ed..079a218 100644
--- a/R/personalis.R
+++ b/R/personalis.R
@@ -351,7 +351,12 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
 }
 
 #' Read in (unfiltered) small variant data from VCF files from personalis folders
-#' @param sample_list A vector of paths to personalis folders
+#' @param sample_paths A vector of paths to personalis folders
+#' @param sample_type Can be one or multiple of of 'tumor', 'normal', or 'somatic'.
+#'   'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
+#'   'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
+#'   'somatic' refers to tumor vs. normal (i.e. somatic mutations only).
+#' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA'
 #' @return SummarizedExperiment
 #' @export
 read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
diff --git a/R/util.R b/R/util.R
index 1528b9e..e75b128 100644
--- a/R/util.R
+++ b/R/util.R
@@ -98,6 +98,7 @@ read_samples <- function(sample_paths, io_func, description, ...) {
 #' @param col_data {data.frame} data frame that is used as colData (must have rownames that are sample identifiers!)
 #' @param sample_col {character} column in `df` that contains the sample identifier
 #' @return {tibble} new data frame with dummy entries added
+#' @importFrom tibble as_tibble
 #' @keywords internal
 add_dummy_entry <- function(df, col_data, sample_col = "sample") {
   missing_samples <- setdiff(rownames(col_data), unique(df[[sample_col]]))
@@ -122,6 +123,7 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
 #' @importFrom dplyr mutate left_join
 #' @importFrom vcfR read.vcfR vcfR2tidy
 #' @importFrom stringr str_split_i
+#' @importFrom tibble as_tibble
 parse_vcf_to_df <- function(path) {
   # parse VCF file
   vcf_content <- read.vcfR(path)

From 9bf8134395d13f0846a735bb7c38dd8741ba654a Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Mon, 11 Mar 2024 09:27:13 +0100
Subject: [PATCH 6/9] missing comma

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 01b03a1..0a44a79 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -8,7 +8,7 @@ Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
 Imports:
-    dplyr
+    dplyr,
     SummarizedExperiment,
     readxl,
     MultiAssayExperiment,

From 9c2d32f42930b0e98725dd5f0807400baf70aecd Mon Sep 17 00:00:00 2001
From: grst <grst@users.noreply.github.com>
Date: Mon, 11 Mar 2024 08:28:42 +0000
Subject: [PATCH 7/9] Roxygenize

---
 NAMESPACE                        | 1 +
 man/read_personalis_vcf_files.Rd | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/NAMESPACE b/NAMESPACE
index 99bf535..2181dee 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -29,6 +29,7 @@ importFrom(rvest,html_text)
 importFrom(rvest,read_html)
 importFrom(stringr,str_split_i)
 importFrom(stringr,str_to_title)
+importFrom(tibble,as_tibble)
 importFrom(tibble,tibble)
 importFrom(tidyr,pivot_longer)
 importFrom(tidyr,pivot_wider)
diff --git a/man/read_personalis_vcf_files.Rd b/man/read_personalis_vcf_files.Rd
index 361f23a..df2e755 100644
--- a/man/read_personalis_vcf_files.Rd
+++ b/man/read_personalis_vcf_files.Rd
@@ -7,7 +7,14 @@
 read_personalis_vcf_files(sample_paths, modality, sample_type)
 }
 \arguments{
-\item{sample_list}{A vector of paths to personalis folders}
+\item{sample_paths}{A vector of paths to personalis folders}
+
+\item{modality}{modality from which the variants were called. Can be either 'DNA' or 'RNA'}
+
+\item{sample_type}{Can be one or multiple of of 'tumor', 'normal', or 'somatic'.
+'tumor' refers to tumor sample vs. genome reference (i.e. somatic+germline mutations),
+'normal' refers to normal sample vs. genome reference (i.e. germline mutations) and
+'somatic' refers to tumor vs. normal (i.e. somatic mutations only).}
 }
 \value{
 SummarizedExperiment

From a14e7d09abddabaa24861fc5fb6c66f1059fb760 Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Mon, 11 Mar 2024 09:30:26 +0100
Subject: [PATCH 8/9] trigger ci


From e9576db1af6e8e8fa0f95299bb123cad4ec68451 Mon Sep 17 00:00:00 2001
From: Gregor Sturm <gregor.sturm@boehringer-ingelheim.com>
Date: Mon, 11 Mar 2024 14:13:19 +0100
Subject: [PATCH 9/9] Fix missing imports

---
 DESCRIPTION    |  2 +-
 NAMESPACE      |  4 ++++
 R/personalis.R | 55 ++++++++++++++++++++++++++------------------------
 R/util.R       | 14 ++++++-------
 4 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 0a44a79..7eabd69 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: PersonalisIO
 Title: Read Personalis data into MultiAssayExperiment objects
-Version: 0.2.0.9000
+Version: 0.3.0.9000
 Authors@R: 
     person("Gregor", "Sturm", , "gregor.sturm@boehringer-ingelheim.com", role = c("aut", "cre"))
 Description: This package provides convenience functions for reading real-world evidence data provided by Personalis into Bioconductor MultiAssayExperiment objects.
diff --git a/NAMESPACE b/NAMESPACE
index 2181dee..72157b1 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,16 +8,20 @@ export(read_personalis_msi_reports)
 export(read_personalis_small_variant_reports)
 export(read_personalis_tcr_reports)
 export(read_personalis_vcf_files)
+importFrom(BumpyMatrix,splitAsBumpyMatrix)
 importFrom(MultiAssayExperiment,MultiAssayExperiment)
 importFrom(SummarizedExperiment,SummarizedExperiment)
 importFrom(dplyr,across)
+importFrom(dplyr,all_of)
 importFrom(dplyr,any_of)
 importFrom(dplyr,bind_rows)
+importFrom(dplyr,contains)
 importFrom(dplyr,cur_column)
 importFrom(dplyr,distinct)
 importFrom(dplyr,if_else)
 importFrom(dplyr,left_join)
 importFrom(dplyr,mutate)
+importFrom(dplyr,rename_with)
 importFrom(dplyr,select)
 importFrom(purrr,keep)
 importFrom(purrr,map)
diff --git a/R/personalis.R b/R/personalis.R
index 079a218..eb6780b 100644
--- a/R/personalis.R
+++ b/R/personalis.R
@@ -9,9 +9,9 @@ GUESS_MAX <- 21474836 # largest value without warning - this is anyway more than
 #' @param read_vcf_data Boolean to specify if VCF file should be read in as well (default: FALSE)
 #' @return MultiAssayExperiment
 #' @export
-read_personalis <- function(pathlist, report_type="preferred", read_vcf_data=FALSE) {
+read_personalis <- function(pathlist, report_type = "preferred", read_vcf_data = FALSE) {
   stopifnot("`report_type` must be one of 'preferred' or 'all'." = report_type %in% c("preferred", "all"))
-  
+
   experiments <- list()
   message(">>> Reading gene expression...")
   experiments[["gene_expression"]] <- read_personalis_gene_expression(pathlist)
@@ -32,7 +32,7 @@ read_personalis <- function(pathlist, report_type="preferred", read_vcf_data=FAL
     message("Reading DNA small variant data (normal) from VCF files... ")
     experiments[["dna_small_variants_normal_vcf"]] <- read_personalis_vcf_files(pathlist, "DNA", "normal")
   }
-  
+
   message(">>> Reading RNA small variant data... ")
   message("Reading RNA small variant data (somatic)... ")
   experiments[["rna_small_variants_somatic"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "somatic", report_type)
@@ -40,7 +40,7 @@ read_personalis <- function(pathlist, report_type="preferred", read_vcf_data=FAL
   experiments[["rna_small_variants_tumor"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "tumor", report_type)
   message("Reading RNA small variant data (normal)... ")
   experiments[["rna_small_variants_normal"]] <- read_personalis_small_variant_reports(pathlist, "RNA", "normal", report_type)
-  
+
   # for RNA we only have tumor or somatic variant VCF data
   if (read_vcf_data) {
     message("Reading RNA small variant data (somatic) from VCF files... ")
@@ -142,6 +142,7 @@ read_personalis_gene_expression_sample <- function(sample_folder) {
 #' @param report_type report_type which should be read. Can be either 'preferred' or 'all'
 #' @return SummarizedExperiment
 #' @importFrom dplyr select distinct bind_rows
+#' @importFrom BumpyMatrix splitAsBumpyMatrix
 #' @importFrom purrr map
 #' @export
 read_personalis_small_variant_reports <- function(sample_paths, modality, sample_type, report_type) {
@@ -228,7 +229,7 @@ read_personalis_small_variant_reports <- function(sample_paths, modality, sample
 #' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA'
 #' @param report_type report_type which should be read. Can be either 'preferred' or 'all'
 #' @return data frame with all variants
-#' @importFrom dplyr mutate
+#' @importFrom dplyr mutate rename_with
 #' @keywords internal
 read_personalis_small_variant_report_sample <- function(sample_folder, modality, sample_type, report_type) {
   stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
@@ -237,7 +238,7 @@ read_personalis_small_variant_report_sample <- function(sample_folder, modality,
   sample_name <- basename(sample_folder)
   # determine report folder based on parameter
   report_folder <- ifelse(report_type == "preferred", "Preferred_Transcripts", "All_Transcripts")
-  
+
   # the prefix of the normal filename ends with N rather than T, e.g. K13T -> tumor, K13N -> normal
   # even though they are all in the K13T folder.
   tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name)
@@ -314,21 +315,22 @@ read_personalis_somatic_variants_summary_statistics <- function(sample_folder, m
 #' @importFrom rvest read_html html_elements html_nodes html_text
 #' @importFrom tidyr pivot_longer
 #' @importFrom stringr str_to_title
+#' @importFrom dplyr all_of across contains
 read_personalis_variant_calling_summary_statistics <- function(sample_folder, modality, sample_type) {
   stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
   stopifnot("`sample_type` must be one of 'tumor', 'normal', or 'somatic'." = sample_type %in% c("tumor", "normal", "somatic"))
   sample_name <- basename(sample_folder)
-  
+
   html_file <- file.path(
     sample_folder,
     "QC_REPORT",
     sprintf("%s_%s_%s_statistics.html", modality, sample_name, tolower(modality))
   )
-  
+
   # decide based on modality and sample_type which content to read from html
   html_section <- if_else(sample_type == "somatic", "#concordance", sprintf("#%s_%s", str_to_title(sample_type), modality))
   table_number <- if_else(sample_type == "somatic", 1, 2)
-  columns_to_fix <- if(sample_type == 'somatic') c() else c('SNVs', 'Indels', 'Total')
+  columns_to_fix <- if (sample_type == "somatic") c() else c("SNVs", "Indels", "Total")
 
   tables <- read_html(html_file) |>
     html_elements(html_section) |>
@@ -358,6 +360,7 @@ read_personalis_variant_calling_summary_statistics <- function(sample_folder, mo
 #'   'somatic' refers to tumor vs. normal (i.e. somatic mutations only).
 #' @param modality modality from which the variants were called. Can be either 'DNA' or 'RNA'
 #' @return SummarizedExperiment
+#' @importFrom BumpyMatrix splitAsBumpyMatrix
 #' @export
 read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
   stopifnot("`modality` must be one of 'DNA' or 'RNA'." = modality %in% c("DNA", "RNA"))
@@ -368,7 +371,7 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
       summary_stats = read_personalis_variant_calling_summary_statistics(sample, ...)
     )
   }
-  
+
   variant_list <- read_samples(
     sample_paths,
     read_all,
@@ -376,11 +379,11 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
     modality = modality,
     sample_type = sample_type
   )
-  
+
   if (!length(variant_list)) {
     return(NULL)
   }
-  
+
   col_data <- map(variant_list, "summary_stats") |>
     bind_rows() |>
     tibble::column_to_rownames("sample")
@@ -389,14 +392,14 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
   row_data <- all_variants |>
     select(
       mut_id,
-      CHROM, 
-      POS, 
+      CHROM,
+      POS,
       REF,
       ALT
     ) |>
     distinct()
   stopifnot("mut_id is not a unique identifier" = !anyDuplicated(row_data$mut_id))
-  
+
   # ignore columns that are already in `row_data` except for the `mut_id` identifier
   row_data_cols <- colnames(row_data)[colnames(row_data) != "mut_id"]
   all_variants <- all_variants[, !(colnames(all_variants) %in% row_data_cols)]
@@ -410,7 +413,7 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
   # sort row and col data to be consistent with BM
   row_data <- tibble::column_to_rownames(row_data, "mut_id")[rownames(variants_bm), ]
   col_data <- col_data[colnames(variants_bm), ]
-  
+
   se <- SummarizedExperiment(
     assays = list(
       variants = variants_bm
@@ -418,7 +421,6 @@ read_personalis_vcf_files <- function(sample_paths, modality, sample_type) {
     rowData = row_data,
     colData = col_data
   )
-  
 }
 
 #' Read Personalis (raw) VCF files for a provided folders/multiple samples and merge to one table.
@@ -437,22 +439,21 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ
   # the prefix of the normal filename ends with N rather than T, e.g. K13T -> tumor, K13N -> normal
   # even though they are all in the K13T folder.
   tmp_sample_name <- if_else(sample_type == "normal", sub("T$", "N", sample_name), sample_name)
-  
+
   # tumor DNA VCF files are gzipped, not totally clear if this rule applies to all cases
   file_ending <- if_else(sample_type == "tumor" & modality == "DNA", "vcf.gz", "vcf")
-  
+
   variant_table <- parse_vcf_to_df(
     file.path(
       sample_folder,
       sprintf("%s_Pipeline", modality),
-      "Variants", 
-      sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending
-        )
-      )
-    ) %>% 
-    mutate(sample = sample_name) %>%
+      "Variants",
+      sprintf("%s_%s_%s_%s.%s", modality, tmp_sample_name, sample_type, tolower(modality), file_ending)
+    )
+  ) |>
+    mutate(sample = sample_name) |>
     mutate(mut_id = sprintf("%s_%s_%s_%s", CHROM, POS, REF, ALT))
-  
+
   variant_table
 }
 
@@ -466,6 +467,7 @@ read_personalis_vcf_files_sample <- function(sample_folder, modality, sample_typ
 #' @param sample_paths List of directories with Personalis samples
 #' @importFrom purrr map
 #' @importFrom dplyr bind_rows
+#' @importFrom BumpyMatrix splitAsBumpyMatrix
 #' @export
 read_personalis_cnv_reports <- function(sample_paths) {
   read_all <- function(path) {
@@ -655,6 +657,7 @@ read_personalis_hla_sample <- function(sample_folder) {
 #' @return SummarizedExperiment
 #' @export
 #' @importFrom purrr map
+#' @importFrom BumpyMatrix splitAsBumpyMatrix
 read_personalis_tcr_reports <- function(sample_paths) {
   read_all <- function(path) {
     list(
diff --git a/R/util.R b/R/util.R
index e75b128..292492e 100644
--- a/R/util.R
+++ b/R/util.R
@@ -127,20 +127,20 @@ add_dummy_entry <- function(df, col_data, sample_col = "sample") {
 parse_vcf_to_df <- function(path) {
   # parse VCF file
   vcf_content <- read.vcfR(path)
-  
+
   # fixed field content to data frame
   fixed_df <- vcfR2tidy(vcf_content)$fix
-  
+
   # GT content to data frame
   gt_df <- vcfR2tidy(vcf_content)$gt
-  
+
   # create addition column with observed nucleotides in order to avoid collisions when we do the left_join
-  gt_df <- gt_df %>%
+  gt_df <- gt_df |>
     dplyr::mutate(ALT = str_split_i(gt_GT_alleles, "/", 2))
-  
+
   # next use ChromKey, POS and ALT for joining vcf content data frames
-  joined_vcf_df <- fixed_df %>%
+  joined_vcf_df <- fixed_df |>
     dplyr::left_join(gt_df, by = c("ChromKey", "POS", "ALT"))
-  
+
   as_tibble(joined_vcf_df)
 }